Add curve25519 and poly1305 by @agl / @floodyberry

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-06 14:10:44 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-06 14:10:44 +0000
commit: 078ff20d758b7d3219f5c10735b3b1757db1eff0 (patch)
tree: 29fd7d08ccecba35ca961dd5e70406ca24504fcf
parent: 82f9e6dff521cd21d7d00939a8093ad00197c61b (diff)
download: rspamd-078ff20d758b7d3219f5c10735b3b1757db1eff0.tar.gz
rspamd-078ff20d758b7d3219f5c10735b3b1757db1eff0.zip
13 files changed, 2963 insertions, 1 deletions
diff --git a/src/libcryptobox/CMakeLists.txt b/src/libcryptobox/CMakeLists.txt
index 311de754e..cf87c2604 100644
--- a/src/libcryptobox/CMakeLists.txt
+++ b/src/libcryptobox/CMakeLists.txt
@@ -4,6 +4,7 @@ INCLUDE(AsmOp.cmake)
 TARGET_ARCHITECTURE(ARCH)
 
 SET(CHACHASRC chacha20/ref.c)
+SET(POLYSRC poly1305/poly1305-donna.c)
 
 # For now we support only x86_64 architecture with optimizations
 IF(${ARCH} STREQUAL "x86_64")
@@ -25,6 +26,11 @@ IF(${ARCH} STREQUAL "x86_64")
 	" "dollar macro convention")
 	CONFIGURE_FILE(platform_config.h.in platform_config.h)
 	INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
+	SET(CURVESRC curve25519/curve25519-donna-c64.c)
+ELSEIF(${ARCH} STREQUAL "i386")
+	SET(CURVESRC curve25519/curve25519-donna.c)
+ELSE()
+	SET(CURVESRC curve25519/ref.c)
 ENDIF()
 
 IF(HAVE_AVX2)
@@ -39,7 +45,8 @@ ENDIF(HAVE_SSE2)
 
 SET(LIBCRYPTOBOXSRC cryptobox.c)
 
-ADD_LIBRARY(rspamd-cryptobox ${LINK_TYPE} ${LIBCRYPTOBOXSRC} ${CHACHASRC})
+ADD_LIBRARY(rspamd-cryptobox ${LINK_TYPE} ${LIBCRYPTOBOXSRC} 
+	${CHACHASRC} ${POLYSRC} ${CURVESRC})
 IF(NOT DEBIAN_BUILD)
 	SET_TARGET_PROPERTIES(rspamd-cryptobox PROPERTIES VERSION ${RSPAMD_VERSION})
 ENDIF(NOT DEBIAN_BUILD)
diff --git a/src/libcryptobox/curve25519/LICENSE.md b/src/libcryptobox/curve25519/LICENSE.md
new file mode 100644
index 000000000..33a3240ae
--- /dev/null
+++ b/src/libcryptobox/curve25519/LICENSE.md
@@ -0,0 +1,46 @@
+Copyright 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+* Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+curve25519-donna: Curve25519 elliptic curve, public key function
+
+http://code.google.com/p/curve25519-donna/
+
+Adam Langley <agl@imperialviolet.org>
+
+Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
+
+More information about curve25519 can be found here
+  http://cr.yp.to/ecdh.html
+
+djb's sample implementation of curve25519 is written in a special assembly
+language called qhasm and uses the floating point registers.
+
+This is, almost, a clean room reimplementation from the curve25519 paper. It
+uses many of the tricks described therein. Only the crecip function is taken
+from the sample implementation.
diff --git a/src/libcryptobox/curve25519/curve25519-donna-c64.c b/src/libcryptobox/curve25519/curve25519-donna-c64.c
new file mode 100644
index 000000000..2d693ee44
--- /dev/null
+++ b/src/libcryptobox/curve25519/curve25519-donna-c64.c
@@ -0,0 +1,500 @@
+/* Copyright 2008, Google Inc.
+ * All rights reserved.
+ *
+ * Code released into the public domain.
+ *
+ * curve25519-donna: Curve25519 elliptic curve, public key function
+ *
+ * http://code.google.com/p/curve25519-donna/
+ *
+ * Adam Langley <agl@imperialviolet.org>
+ *
+ * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
+ *
+ * More information about curve25519 can be found here
+ *   http://cr.yp.to/ecdh.html
+ *
+ * djb's sample implementation of curve25519 is written in a special assembly
+ * language called qhasm and uses the floating point registers.
+ *
+ * This is, almost, a clean room reimplementation from the curve25519 paper. It
+ * uses many of the tricks described therein. Only the crecip function is taken
+ * from the sample implementation.
+ */
+
+#include <string.h>
+#include <stdint.h>
+
+typedef uint8_t u8;
+typedef uint64_t limb;
+typedef limb felem[5];
+// This is a special gcc mode for 128-bit integers. It's implemented on 64-bit
+// platforms only as far as I know.
+typedef unsigned uint128_t __attribute__((mode(TI)));
+
+#undef force_inline
+#define force_inline __attribute__((always_inline))
+
+/* Sum two numbers: output += in */
+static inline void force_inline
+fsum (limb *output, const limb *in)
+{
+	output[0] += in[0];
+	output[1] += in[1];
+	output[2] += in[2];
+	output[3] += in[3];
+	output[4] += in[4];
+}
+
+/* Find the difference of two numbers: output = in - output
+ * (note the order of the arguments!)
+ *
+ * Assumes that out[i] < 2**52
+ * On return, out[i] < 2**55
+ */
+static inline void force_inline
+fdifference_backwards (felem out, const felem in)
+{
+	/* 152 is 19 << 3 */
+	static const limb two54m152 = (((limb) 1) << 54) - 152;
+	static const limb two54m8 = (((limb) 1) << 54) - 8;
+
+	out[0] = in[0] + two54m152 - out[0];
+	out[1] = in[1] + two54m8 - out[1];
+	out[2] = in[2] + two54m8 - out[2];
+	out[3] = in[3] + two54m8 - out[3];
+	out[4] = in[4] + two54m8 - out[4];
+}
+
+/* Multiply a number by a scalar: output = in * scalar */
+static inline void force_inline
+fscalar_product (felem output, const felem in, const limb scalar)
+{
+	uint128_t a;
+
+	a = ((uint128_t) in[0]) * scalar;
+	output[0] = ((limb) a) & 0x7ffffffffffff;
+
+	a = ((uint128_t) in[1]) * scalar + ((limb) (a >> 51));
+	output[1] = ((limb) a) & 0x7ffffffffffff;
+
+	a = ((uint128_t) in[2]) * scalar + ((limb) (a >> 51));
+	output[2] = ((limb) a) & 0x7ffffffffffff;
+
+	a = ((uint128_t) in[3]) * scalar + ((limb) (a >> 51));
+	output[3] = ((limb) a) & 0x7ffffffffffff;
+
+	a = ((uint128_t) in[4]) * scalar + ((limb) (a >> 51));
+	output[4] = ((limb) a) & 0x7ffffffffffff;
+
+	output[0] += (a >> 51) * 19;
+}
+
+/* Multiply two numbers: output = in2 * in
+ *
+ * output must be distinct to both inputs. The inputs are reduced coefficient
+ * form, the output is not.
+ *
+ * Assumes that in[i] < 2**55 and likewise for in2.
+ * On return, output[i] < 2**52
+ */
+static inline void force_inline
+fmul (felem output, const felem in2, const felem in)
+{
+	uint128_t t[5];
+	limb r0, r1, r2, r3, r4, s0, s1, s2, s3, s4, c;
+
+	r0 = in[0];
+	r1 = in[1];
+	r2 = in[2];
+	r3 = in[3];
+	r4 = in[4];
+
+	s0 = in2[0];
+	s1 = in2[1];
+	s2 = in2[2];
+	s3 = in2[3];
+	s4 = in2[4];
+
+	t[0] = ((uint128_t) r0) * s0;
+	t[1] = ((uint128_t) r0) * s1 + ((uint128_t) r1) * s0;
+	t[2] = ((uint128_t) r0) * s2 + ((uint128_t) r2) * s0
+			+ ((uint128_t) r1) * s1;
+	t[3] = ((uint128_t) r0) * s3 + ((uint128_t) r3) * s0 + ((uint128_t) r1) * s2
+			+ ((uint128_t) r2) * s1;
+	t[4] = ((uint128_t) r0) * s4 + ((uint128_t) r4) * s0 + ((uint128_t) r3) * s1
+			+ ((uint128_t) r1) * s3 + ((uint128_t) r2) * s2;
+
+	r4 *= 19;
+	r1 *= 19;
+	r2 *= 19;
+	r3 *= 19;
+
+	t[0] += ((uint128_t) r4) * s1 + ((uint128_t) r1) * s4
+			+ ((uint128_t) r2) * s3 + ((uint128_t) r3) * s2;
+	t[1] += ((uint128_t) r4) * s2 + ((uint128_t) r2) * s4
+			+ ((uint128_t) r3) * s3;
+	t[2] += ((uint128_t) r4) * s3 + ((uint128_t) r3) * s4;
+	t[3] += ((uint128_t) r4) * s4;
+
+	r0 = (limb) t[0] & 0x7ffffffffffff;
+	c = (limb) (t[0] >> 51);
+	t[1] += c;
+	r1 = (limb) t[1] & 0x7ffffffffffff;
+	c = (limb) (t[1] >> 51);
+	t[2] += c;
+	r2 = (limb) t[2] & 0x7ffffffffffff;
+	c = (limb) (t[2] >> 51);
+	t[3] += c;
+	r3 = (limb) t[3] & 0x7ffffffffffff;
+	c = (limb) (t[3] >> 51);
+	t[4] += c;
+	r4 = (limb) t[4] & 0x7ffffffffffff;
+	c = (limb) (t[4] >> 51);
+	r0 += c * 19;
+	c = r0 >> 51;
+	r0 = r0 & 0x7ffffffffffff;
+	r1 += c;
+	c = r1 >> 51;
+	r1 = r1 & 0x7ffffffffffff;
+	r2 += c;
+
+	output[0] = r0;
+	output[1] = r1;
+	output[2] = r2;
+	output[3] = r3;
+	output[4] = r4;
+}
+
+static inline void force_inline
+fsquare_times (felem output, const felem in, limb count)
+{
+	uint128_t t[5];
+	limb r0, r1, r2, r3, r4, c;
+	limb d0, d1, d2, d4, d419;
+
+	r0 = in[0];
+	r1 = in[1];
+	r2 = in[2];
+	r3 = in[3];
+	r4 = in[4];
+
+	do {
+		d0 = r0 * 2;
+		d1 = r1 * 2;
+		d2 = r2 * 2 * 19;
+		d419 = r4 * 19;
+		d4 = d419 * 2;
+
+		t[0] = ((uint128_t) r0) * r0 + ((uint128_t) d4) * r1
+				+ (((uint128_t) d2) * (r3));
+		t[1] = ((uint128_t) d0) * r1 + ((uint128_t) d4) * r2
+				+ (((uint128_t) r3) * (r3 * 19));
+		t[2] = ((uint128_t) d0) * r2 + ((uint128_t) r1) * r1
+				+ (((uint128_t) d4) * (r3));
+		t[3] = ((uint128_t) d0) * r3 + ((uint128_t) d1) * r2
+				+ (((uint128_t) r4) * (d419));
+		t[4] = ((uint128_t) d0) * r4 + ((uint128_t) d1) * r3
+				+ (((uint128_t) r2) * (r2));
+
+		r0 = (limb) t[0] & 0x7ffffffffffff;
+		c = (limb) (t[0] >> 51);
+		t[1] += c;
+		r1 = (limb) t[1] & 0x7ffffffffffff;
+		c = (limb) (t[1] >> 51);
+		t[2] += c;
+		r2 = (limb) t[2] & 0x7ffffffffffff;
+		c = (limb) (t[2] >> 51);
+		t[3] += c;
+		r3 = (limb) t[3] & 0x7ffffffffffff;
+		c = (limb) (t[3] >> 51);
+		t[4] += c;
+		r4 = (limb) t[4] & 0x7ffffffffffff;
+		c = (limb) (t[4] >> 51);
+		r0 += c * 19;
+		c = r0 >> 51;
+		r0 = r0 & 0x7ffffffffffff;
+		r1 += c;
+		c = r1 >> 51;
+		r1 = r1 & 0x7ffffffffffff;
+		r2 += c;
+	} while (--count);
+
+	output[0] = r0;
+	output[1] = r1;
+	output[2] = r2;
+	output[3] = r3;
+	output[4] = r4;
+}
+
+/* Load a little-endian 64-bit number  */
+static limb load_limb (const u8 *in)
+{
+	return ((limb) in[0]) | (((limb) in[1]) << 8) | (((limb) in[2]) << 16)
+			| (((limb) in[3]) << 24) | (((limb) in[4]) << 32)
+			| (((limb) in[5]) << 40) | (((limb) in[6]) << 48)
+			| (((limb) in[7]) << 56);
+}
+
+static void store_limb (u8 *out, limb in)
+{
+	out[0] = in & 0xff;
+	out[1] = (in >> 8) & 0xff;
+	out[2] = (in >> 16) & 0xff;
+	out[3] = (in >> 24) & 0xff;
+	out[4] = (in >> 32) & 0xff;
+	out[5] = (in >> 40) & 0xff;
+	out[6] = (in >> 48) & 0xff;
+	out[7] = (in >> 56) & 0xff;
+}
+
+/* Take a little-endian, 32-byte number and expand it into polynomial form */
+static void fexpand (limb *output, const u8 *in)
+{
+	output[0] = load_limb (in) & 0x7ffffffffffff;
+	output[1] = (load_limb (in + 6) >> 3) & 0x7ffffffffffff;
+	output[2] = (load_limb (in + 12) >> 6) & 0x7ffffffffffff;
+	output[3] = (load_limb (in + 19) >> 1) & 0x7ffffffffffff;
+	output[4] = (load_limb (in + 24) >> 12) & 0x7ffffffffffff;
+}
+
+/* Take a fully reduced polynomial form number and contract it into a
+ * little-endian, 32-byte array
+ */
+static void fcontract (u8 *output, const felem input)
+{
+	uint128_t t[5];
+
+	t[0] = input[0];
+	t[1] = input[1];
+	t[2] = input[2];
+	t[3] = input[3];
+	t[4] = input[4];
+
+	t[1] += t[0] >> 51;
+	t[0] &= 0x7ffffffffffff;
+	t[2] += t[1] >> 51;
+	t[1] &= 0x7ffffffffffff;
+	t[3] += t[2] >> 51;
+	t[2] &= 0x7ffffffffffff;
+	t[4] += t[3] >> 51;
+	t[3] &= 0x7ffffffffffff;
+	t[0] += 19 * (t[4] >> 51);
+	t[4] &= 0x7ffffffffffff;
+
+	t[1] += t[0] >> 51;
+	t[0] &= 0x7ffffffffffff;
+	t[2] += t[1] >> 51;
+	t[1] &= 0x7ffffffffffff;
+	t[3] += t[2] >> 51;
+	t[2] &= 0x7ffffffffffff;
+	t[4] += t[3] >> 51;
+	t[3] &= 0x7ffffffffffff;
+	t[0] += 19 * (t[4] >> 51);
+	t[4] &= 0x7ffffffffffff;
+
+	/* now t is between 0 and 2^255-1, properly carried. */
+	/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
+
+	t[0] += 19;
+
+	t[1] += t[0] >> 51;
+	t[0] &= 0x7ffffffffffff;
+	t[2] += t[1] >> 51;
+	t[1] &= 0x7ffffffffffff;
+	t[3] += t[2] >> 51;
+	t[2] &= 0x7ffffffffffff;
+	t[4] += t[3] >> 51;
+	t[3] &= 0x7ffffffffffff;
+	t[0] += 19 * (t[4] >> 51);
+	t[4] &= 0x7ffffffffffff;
+
+	/* now between 19 and 2^255-1 in both cases, and offset by 19. */
+
+	t[0] += 0x8000000000000 - 19;
+	t[1] += 0x8000000000000 - 1;
+	t[2] += 0x8000000000000 - 1;
+	t[3] += 0x8000000000000 - 1;
+	t[4] += 0x8000000000000 - 1;
+
+	/* now between 2^255 and 2^256-20, and offset by 2^255. */
+
+	t[1] += t[0] >> 51;
+	t[0] &= 0x7ffffffffffff;
+	t[2] += t[1] >> 51;
+	t[1] &= 0x7ffffffffffff;
+	t[3] += t[2] >> 51;
+	t[2] &= 0x7ffffffffffff;
+	t[4] += t[3] >> 51;
+	t[3] &= 0x7ffffffffffff;
+	t[4] &= 0x7ffffffffffff;
+
+	store_limb (output, t[0] | (t[1] << 51));
+	store_limb (output + 8, (t[1] >> 13) | (t[2] << 38));
+	store_limb (output + 16, (t[2] >> 26) | (t[3] << 25));
+	store_limb (output + 24, (t[3] >> 39) | (t[4] << 12));
+}
+
+/* Input: Q, Q', Q-Q'
+ * Output: 2Q, Q+Q'
+ *
+ *   x2 z3: long form
+ *   x3 z3: long form
+ *   x z: short form, destroyed
+ *   xprime zprime: short form, destroyed
+ *   qmqp: short form, preserved
+ */
+static void fmonty (limb *x2, limb *z2, /* output 2Q */
+limb *x3, limb *z3, /* output Q + Q' */
+limb *x, limb *z, /* input Q */
+limb *xprime, limb *zprime, /* input Q' */
+const limb *qmqp /* input Q - Q' */)
+{
+	limb origx[5], origxprime[5], zzz[5], xx[5], zz[5], xxprime[5], zzprime[5],
+			zzzprime[5];
+
+	memcpy (origx, x, 5 * sizeof(limb));
+	fsum (x, z);
+	fdifference_backwards (z, origx);  // does x - z
+
+	memcpy (origxprime, xprime, sizeof(limb) * 5);
+	fsum (xprime, zprime);
+	fdifference_backwards (zprime, origxprime);
+	fmul (xxprime, xprime, z);
+	fmul (zzprime, x, zprime);
+	memcpy (origxprime, xxprime, sizeof(limb) * 5);
+	fsum (xxprime, zzprime);
+	fdifference_backwards (zzprime, origxprime);
+	fsquare_times (x3, xxprime, 1);
+	fsquare_times (zzzprime, zzprime, 1);
+	fmul (z3, zzzprime, qmqp);
+
+	fsquare_times (xx, x, 1);
+	fsquare_times (zz, z, 1);
+	fmul (x2, xx, zz);
+	fdifference_backwards (zz, xx);  // does zz = xx - zz
+	fscalar_product (zzz, zz, 121665);
+	fsum (zzz, xx);
+	fmul (z2, zz, zzz);
+}
+
+// -----------------------------------------------------------------------------
+// Maybe swap the contents of two limb arrays (@a and @b), each @len elements
+// long. Perform the swap iff @swap is non-zero.
+//
+// This function performs the swap without leaking any side-channel
+// information.
+// -----------------------------------------------------------------------------
+static void swap_conditional (limb a[5], limb b[5], limb iswap)
+{
+	unsigned i;
+	const limb swap = -iswap;
+
+	for (i = 0; i < 5; ++i) {
+		const limb x = swap & (a[i] ^ b[i]);
+		a[i] ^= x;
+		b[i] ^= x;
+	}
+}
+
+/* Calculates nQ where Q is the x-coordinate of a point on the curve
+ *
+ *   resultx/resultz: the x coordinate of the resulting curve point (short form)
+ *   n: a little endian, 32-byte number
+ *   q: a point of the curve (short form)
+ */
+static void cmult (limb *resultx, limb *resultz, const u8 *n, const limb *q)
+{
+	limb a[5] = { 0 }, b[5] = { 1 }, c[5] = { 1 }, d[5] = { 0 };
+	limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
+	limb e[5] = { 0 }, f[5] = { 1 }, g[5] = { 0 }, h[5] = { 1 };
+	limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
+
+	unsigned i, j;
+
+	memcpy (nqpqx, q, sizeof(limb) * 5);
+
+	for (i = 0; i < 32; ++i) {
+		u8 byte = n[31 - i];
+		for (j = 0; j < 8; ++j) {
+			const limb bit = byte >> 7;
+
+			swap_conditional (nqx, nqpqx, bit);
+			swap_conditional (nqz, nqpqz, bit);
+			fmonty (nqx2, nqz2, nqpqx2, nqpqz2, nqx, nqz, nqpqx, nqpqz, q);
+			swap_conditional (nqx2, nqpqx2, bit);
+			swap_conditional (nqz2, nqpqz2, bit);
+
+			t = nqx;
+			nqx = nqx2;
+			nqx2 = t;
+			t = nqz;
+			nqz = nqz2;
+			nqz2 = t;
+			t = nqpqx;
+			nqpqx = nqpqx2;
+			nqpqx2 = t;
+			t = nqpqz;
+			nqpqz = nqpqz2;
+			nqpqz2 = t;
+
+			byte <<= 1;
+		}
+	}
+
+	memcpy (resultx, nqx, sizeof(limb) * 5);
+	memcpy (resultz, nqz, sizeof(limb) * 5);
+}
+
+// -----------------------------------------------------------------------------
+// Shamelessly copied from djb's code, tightened a little
+// -----------------------------------------------------------------------------
+static void crecip (felem out, const felem z)
+{
+	felem a, t0, b, c;
+
+	/* 2 */fsquare_times (a, z, 1); // a = 2
+	/* 8 */fsquare_times (t0, a, 2);
+	/* 9 */fmul (b, t0, z); // b = 9
+	/* 11 */fmul (a, b, a); // a = 11
+	/* 22 */fsquare_times (t0, a, 1);
+	/* 2^5 - 2^0 = 31 */fmul (b, t0, b);
+	/* 2^10 - 2^5 */fsquare_times (t0, b, 5);
+	/* 2^10 - 2^0 */fmul (b, t0, b);
+	/* 2^20 - 2^10 */fsquare_times (t0, b, 10);
+	/* 2^20 - 2^0 */fmul (c, t0, b);
+	/* 2^40 - 2^20 */fsquare_times (t0, c, 20);
+	/* 2^40 - 2^0 */fmul (t0, t0, c);
+	/* 2^50 - 2^10 */fsquare_times (t0, t0, 10);
+	/* 2^50 - 2^0 */fmul (b, t0, b);
+	/* 2^100 - 2^50 */fsquare_times (t0, b, 50);
+	/* 2^100 - 2^0 */fmul (c, t0, b);
+	/* 2^200 - 2^100 */fsquare_times (t0, c, 100);
+	/* 2^200 - 2^0 */fmul (t0, t0, c);
+	/* 2^250 - 2^50 */fsquare_times (t0, t0, 50);
+	/* 2^250 - 2^0 */fmul (t0, t0, b);
+	/* 2^255 - 2^5 */fsquare_times (t0, t0, 5);
+	/* 2^255 - 21 */fmul (out, t0, a);
+}
+
+int curve25519 (u8 *, const u8 *, const u8 *);
+
+int curve25519 (u8 *mypublic, const u8 *secret, const u8 *basepoint)
+{
+	limb bp[5], x[5], z[5], zmone[5];
+	uint8_t e[32];
+	int i;
+
+	for (i = 0; i < 32; ++i)
+		e[i] = secret[i];
+	e[0] &= 248;
+	e[31] &= 127;
+	e[31] |= 64;
+
+	fexpand (bp, basepoint);
+	cmult (x, z, e, bp);
+	crecip (zmone, z);
+	fmul (z, x, zmone);
+	fcontract (mypublic, z);
+	return 0;
+}
diff --git a/src/libcryptobox/curve25519/curve25519-donna.c b/src/libcryptobox/curve25519/curve25519-donna.c
new file mode 100644
index 000000000..f9f19a632
--- /dev/null
+++ b/src/libcryptobox/curve25519/curve25519-donna.c
@@ -0,0 +1,912 @@
+/* Copyright 2008, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * curve25519-donna: Curve25519 elliptic curve, public key function
+ *
+ * http://code.google.com/p/curve25519-donna/
+ *
+ * Adam Langley <agl@imperialviolet.org>
+ *
+ * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
+ *
+ * More information about curve25519 can be found here
+ *   http://cr.yp.to/ecdh.html
+ *
+ * djb's sample implementation of curve25519 is written in a special assembly
+ * language called qhasm and uses the floating point registers.
+ *
+ * This is, almost, a clean room reimplementation from the curve25519 paper. It
+ * uses many of the tricks described therein. Only the crecip function is taken
+ * from the sample implementation. */
+
+#include <string.h>
+#include <stdint.h>
+
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+
+typedef uint8_t u8;
+typedef int32_t s32;
+typedef int64_t limb;
+
+/* Field element representation:
+ *
+ * Field elements are written as an array of signed, 64-bit limbs, least
+ * significant first. The value of the field element is:
+ *   x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
+ *
+ * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */
+
+/* Sum two numbers: output += in */
+static void fsum (limb *output, const limb *in)
+{
+	unsigned i;
+	for (i = 0; i < 10; i += 2) {
+		output[0 + i] = output[0 + i] + in[0 + i];
+		output[1 + i] = output[1 + i] + in[1 + i];
+	}
+}
+
+/* Find the difference of two numbers: output = in - output
+ * (note the order of the arguments!). */
+static void fdifference (limb *output, const limb *in)
+{
+	unsigned i;
+	for (i = 0; i < 10; ++i) {
+		output[i] = in[i] - output[i];
+	}
+}
+
+/* Multiply a number by a scalar: output = in * scalar */
+static void fscalar_product (limb *output, const limb *in, const limb scalar)
+{
+	unsigned i;
+	for (i = 0; i < 10; ++i) {
+		output[i] = in[i] * scalar;
+	}
+}
+
+/* Multiply two numbers: output = in2 * in
+ *
+ * output must be distinct to both inputs. The inputs are reduced coefficient
+ * form, the output is not.
+ *
+ * output[x] <= 14 * the largest product of the input limbs. */
+static void fproduct (limb *output, const limb *in2, const limb *in)
+{
+	output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
+	output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1])
+			+ ((limb) ((s32) in2[1])) * ((s32) in[0]);
+	output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1])
+			+ ((limb) ((s32) in2[0])) * ((s32) in[2])
+			+ ((limb) ((s32) in2[2])) * ((s32) in[0]);
+	output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2])
+			+ ((limb) ((s32) in2[2])) * ((s32) in[1])
+			+ ((limb) ((s32) in2[0])) * ((s32) in[3])
+			+ ((limb) ((s32) in2[3])) * ((s32) in[0]);
+	output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2])
+			+ 2
+					* (((limb) ((s32) in2[1])) * ((s32) in[3])
+							+ ((limb) ((s32) in2[3])) * ((s32) in[1]))
+			+ ((limb) ((s32) in2[0])) * ((s32) in[4])
+			+ ((limb) ((s32) in2[4])) * ((s32) in[0]);
+	output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3])
+			+ ((limb) ((s32) in2[3])) * ((s32) in[2])
+			+ ((limb) ((s32) in2[1])) * ((s32) in[4])
+			+ ((limb) ((s32) in2[4])) * ((s32) in[1])
+			+ ((limb) ((s32) in2[0])) * ((s32) in[5])
+			+ ((limb) ((s32) in2[5])) * ((s32) in[0]);
+	output[6] = 2
+			* (((limb) ((s32) in2[3])) * ((s32) in[3])
+					+ ((limb) ((s32) in2[1])) * ((s32) in[5])
+					+ ((limb) ((s32) in2[5])) * ((s32) in[1]))
+			+ ((limb) ((s32) in2[2])) * ((s32) in[4])
+			+ ((limb) ((s32) in2[4])) * ((s32) in[2])
+			+ ((limb) ((s32) in2[0])) * ((s32) in[6])
+			+ ((limb) ((s32) in2[6])) * ((s32) in[0]);
+	output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4])
+			+ ((limb) ((s32) in2[4])) * ((s32) in[3])
+			+ ((limb) ((s32) in2[2])) * ((s32) in[5])
+			+ ((limb) ((s32) in2[5])) * ((s32) in[2])
+			+ ((limb) ((s32) in2[1])) * ((s32) in[6])
+			+ ((limb) ((s32) in2[6])) * ((s32) in[1])
+			+ ((limb) ((s32) in2[0])) * ((s32) in[7])
+			+ ((limb) ((s32) in2[7])) * ((s32) in[0]);
+	output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4])
+			+ 2
+					* (((limb) ((s32) in2[3])) * ((s32) in[5])
+							+ ((limb) ((s32) in2[5])) * ((s32) in[3])
+							+ ((limb) ((s32) in2[1])) * ((s32) in[7])
+							+ ((limb) ((s32) in2[7])) * ((s32) in[1]))
+			+ ((limb) ((s32) in2[2])) * ((s32) in[6])
+			+ ((limb) ((s32) in2[6])) * ((s32) in[2])
+			+ ((limb) ((s32) in2[0])) * ((s32) in[8])
+			+ ((limb) ((s32) in2[8])) * ((s32) in[0]);
+	output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5])
+			+ ((limb) ((s32) in2[5])) * ((s32) in[4])
+			+ ((limb) ((s32) in2[3])) * ((s32) in[6])
+			+ ((limb) ((s32) in2[6])) * ((s32) in[3])
+			+ ((limb) ((s32) in2[2])) * ((s32) in[7])
+			+ ((limb) ((s32) in2[7])) * ((s32) in[2])
+			+ ((limb) ((s32) in2[1])) * ((s32) in[8])
+			+ ((limb) ((s32) in2[8])) * ((s32) in[1])
+			+ ((limb) ((s32) in2[0])) * ((s32) in[9])
+			+ ((limb) ((s32) in2[9])) * ((s32) in[0]);
+	output[10] = 2
+			* (((limb) ((s32) in2[5])) * ((s32) in[5])
+					+ ((limb) ((s32) in2[3])) * ((s32) in[7])
+					+ ((limb) ((s32) in2[7])) * ((s32) in[3])
+					+ ((limb) ((s32) in2[1])) * ((s32) in[9])
+					+ ((limb) ((s32) in2[9])) * ((s32) in[1]))
+			+ ((limb) ((s32) in2[4])) * ((s32) in[6])
+			+ ((limb) ((s32) in2[6])) * ((s32) in[4])
+			+ ((limb) ((s32) in2[2])) * ((s32) in[8])
+			+ ((limb) ((s32) in2[8])) * ((s32) in[2]);
+	output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6])
+			+ ((limb) ((s32) in2[6])) * ((s32) in[5])
+			+ ((limb) ((s32) in2[4])) * ((s32) in[7])
+			+ ((limb) ((s32) in2[7])) * ((s32) in[4])
+			+ ((limb) ((s32) in2[3])) * ((s32) in[8])
+			+ ((limb) ((s32) in2[8])) * ((s32) in[3])
+			+ ((limb) ((s32) in2[2])) * ((s32) in[9])
+			+ ((limb) ((s32) in2[9])) * ((s32) in[2]);
+	output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6])
+			+ 2
+					* (((limb) ((s32) in2[5])) * ((s32) in[7])
+							+ ((limb) ((s32) in2[7])) * ((s32) in[5])
+							+ ((limb) ((s32) in2[3])) * ((s32) in[9])
+							+ ((limb) ((s32) in2[9])) * ((s32) in[3]))
+			+ ((limb) ((s32) in2[4])) * ((s32) in[8])
+			+ ((limb) ((s32) in2[8])) * ((s32) in[4]);
+	output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7])
+			+ ((limb) ((s32) in2[7])) * ((s32) in[6])
+			+ ((limb) ((s32) in2[5])) * ((s32) in[8])
+			+ ((limb) ((s32) in2[8])) * ((s32) in[5])
+			+ ((limb) ((s32) in2[4])) * ((s32) in[9])
+			+ ((limb) ((s32) in2[9])) * ((s32) in[4]);
+	output[14] = 2
+			* (((limb) ((s32) in2[7])) * ((s32) in[7])
+					+ ((limb) ((s32) in2[5])) * ((s32) in[9])
+					+ ((limb) ((s32) in2[9])) * ((s32) in[5]))
+			+ ((limb) ((s32) in2[6])) * ((s32) in[8])
+			+ ((limb) ((s32) in2[8])) * ((s32) in[6]);
+	output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8])
+			+ ((limb) ((s32) in2[8])) * ((s32) in[7])
+			+ ((limb) ((s32) in2[6])) * ((s32) in[9])
+			+ ((limb) ((s32) in2[9])) * ((s32) in[6]);
+	output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8])
+			+ 2
+					* (((limb) ((s32) in2[7])) * ((s32) in[9])
+							+ ((limb) ((s32) in2[9])) * ((s32) in[7]));
+	output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9])
+			+ ((limb) ((s32) in2[9])) * ((s32) in[8]);
+	output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
+}
+
+/* Reduce a long form to a short form by taking the input mod 2^255 - 19.
+ *
+ * On entry: |output[i]| < 14*2^54
+ * On exit: |output[0..8]| < 280*2^54 */
+static void freduce_degree (limb *output)
+{
+	/* Each of these shifts and adds ends up multiplying the value by 19.
+	 *
+	 * For output[0..8], the absolute entry value is < 14*2^54 and we add, at
+	 * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */
+	output[8] += output[18] << 4;
+	output[8] += output[18] << 1;
+	output[8] += output[18];
+	output[7] += output[17] << 4;
+	output[7] += output[17] << 1;
+	output[7] += output[17];
+	output[6] += output[16] << 4;
+	output[6] += output[16] << 1;
+	output[6] += output[16];
+	output[5] += output[15] << 4;
+	output[5] += output[15] << 1;
+	output[5] += output[15];
+	output[4] += output[14] << 4;
+	output[4] += output[14] << 1;
+	output[4] += output[14];
+	output[3] += output[13] << 4;
+	output[3] += output[13] << 1;
+	output[3] += output[13];
+	output[2] += output[12] << 4;
+	output[2] += output[12] << 1;
+	output[2] += output[12];
+	output[1] += output[11] << 4;
+	output[1] += output[11] << 1;
+	output[1] += output[11];
+	output[0] += output[10] << 4;
+	output[0] += output[10] << 1;
+	output[0] += output[10];
+}
+
+#if (-1 & 3) != 3
+#error "This code only works on a two's complement system"
+#endif
+
+/* return v / 2^26, using only shifts and adds.
+ *
+ * On entry: v can take any value. */
+static inline limb div_by_2_26 (const limb v)
+{
+	/* High word of v; no shift needed. */
+	const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
+	/* Set to all 1s if v was negative; else set to 0s. */
+	const int32_t sign = ((int32_t) highword) >> 31;
+	/* Set to 0x3ffffff if v was negative; else set to 0. */
+	const int32_t roundoff = ((uint32_t) sign) >> 6;
+	/* Should return v / (1<<26) */
+	return (v + roundoff) >> 26;
+}
+
+/* return v / (2^25), using only shifts and adds.
+ *
+ * On entry: v can take any value. */
+static inline limb div_by_2_25 (const limb v)
+{
+	/* High word of v; no shift needed*/
+	const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
+	/* Set to all 1s if v was negative; else set to 0s. */
+	const int32_t sign = ((int32_t) highword) >> 31;
+	/* Set to 0x1ffffff if v was negative; else set to 0. */
+	const int32_t roundoff = ((uint32_t) sign) >> 7;
+	/* Should return v / (1<<25) */
+	return (v + roundoff) >> 25;
+}
+
+/* Reduce all coefficients of the short form input so that |x| < 2^26.
+ *
+ * On entry: |output[i]| < 280*2^54 */
+static void freduce_coefficients (limb *output)
+{
+	unsigned i;
+
+	output[10] = 0;
+
+	for (i = 0; i < 10; i += 2) {
+		limb over = div_by_2_26 (output[i]);
+		/* The entry condition (that |output[i]| < 280*2^54) means that over is, at
+		 * most, 280*2^28 in the first iteration of this loop. This is added to the
+		 * next limb and we can approximate the resulting bound of that limb by
+		 * 281*2^54. */
+		output[i] -= over << 26;
+		output[i + 1] += over;
+
+		/* For the first iteration, |output[i+1]| < 281*2^54, thus |over| <
+		 * 281*2^29. When this is added to the next limb, the resulting bound can
+		 * be approximated as 281*2^54.
+		 *
+		 * For subsequent iterations of the loop, 281*2^54 remains a conservative
+		 * bound and no overflow occurs. */
+		over = div_by_2_25 (output[i + 1]);
+		output[i + 1] -= over << 25;
+		output[i + 2] += over;
+	}
+	/* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */
+	output[0] += output[10] << 4;
+	output[0] += output[10] << 1;
+	output[0] += output[10];
+
+	output[10] = 0;
+
+	/* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29
+	 * So |over| will be no more than 2^16. */
+	{
+		limb over = div_by_2_26 (output[0]);
+		output[0] -= over << 26;
+		output[1] += over;
+	}
+
+	/* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The
+	 * bound on |output[1]| is sufficient to meet our needs. */
+}
+
+/* A helpful wrapper around fproduct: output = in * in2.
+ *
+ * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27.
+ *
+ * output must be distinct to both inputs. The output is reduced degree
+ * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */
+static void fmul (limb *output, const limb *in, const limb *in2)
+{
+	limb t[19];
+	fproduct (t, in, in2);
+	/* |t[i]| < 14*2^54 */
+	freduce_degree (t);
+	freduce_coefficients (t);
+	/* |t[i]| < 2^26 */
+	memcpy (output, t, sizeof(limb) * 10);
+}
+
+/* Square a number: output = in**2
+ *
+ * output must be distinct from the input. The inputs are reduced coefficient
+ * form, the output is not.
+ *
+ * output[x] <= 14 * the largest product of the input limbs. */
+static void fsquare_inner (limb *output, const limb *in)
+{
+	output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
+	output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
+	output[2] = 2
+			* (((limb) ((s32) in[1])) * ((s32) in[1])
+					+ ((limb) ((s32) in[0])) * ((s32) in[2]));
+	output[3] = 2
+			* (((limb) ((s32) in[1])) * ((s32) in[2])
+					+ ((limb) ((s32) in[0])) * ((s32) in[3]));
+	output[4] = ((limb) ((s32) in[2])) * ((s32) in[2])
+			+ 4 * ((limb) ((s32) in[1])) * ((s32) in[3])
+			+ 2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
+	output[5] = 2
+			* (((limb) ((s32) in[2])) * ((s32) in[3])
+					+ ((limb) ((s32) in[1])) * ((s32) in[4])
+					+ ((limb) ((s32) in[0])) * ((s32) in[5]));
+	output[6] = 2
+			* (((limb) ((s32) in[3])) * ((s32) in[3])
+					+ ((limb) ((s32) in[2])) * ((s32) in[4])
+					+ ((limb) ((s32) in[0])) * ((s32) in[6])
+					+ 2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
+	output[7] = 2
+			* (((limb) ((s32) in[3])) * ((s32) in[4])
+					+ ((limb) ((s32) in[2])) * ((s32) in[5])
+					+ ((limb) ((s32) in[1])) * ((s32) in[6])
+					+ ((limb) ((s32) in[0])) * ((s32) in[7]));
+	output[8] = ((limb) ((s32) in[4])) * ((s32) in[4])
+			+ 2
+					* (((limb) ((s32) in[2])) * ((s32) in[6])
+							+ ((limb) ((s32) in[0])) * ((s32) in[8])
+							+ 2
+									* (((limb) ((s32) in[1])) * ((s32) in[7])
+											+ ((limb) ((s32) in[3]))
+													* ((s32) in[5])));
+	output[9] = 2
+			* (((limb) ((s32) in[4])) * ((s32) in[5])
+					+ ((limb) ((s32) in[3])) * ((s32) in[6])
+					+ ((limb) ((s32) in[2])) * ((s32) in[7])
+					+ ((limb) ((s32) in[1])) * ((s32) in[8])
+					+ ((limb) ((s32) in[0])) * ((s32) in[9]));
+	output[10] = 2
+			* (((limb) ((s32) in[5])) * ((s32) in[5])
+					+ ((limb) ((s32) in[4])) * ((s32) in[6])
+					+ ((limb) ((s32) in[2])) * ((s32) in[8])
+					+ 2
+							* (((limb) ((s32) in[3])) * ((s32) in[7])
+									+ ((limb) ((s32) in[1])) * ((s32) in[9])));
+	output[11] = 2
+			* (((limb) ((s32) in[5])) * ((s32) in[6])
+					+ ((limb) ((s32) in[4])) * ((s32) in[7])
+					+ ((limb) ((s32) in[3])) * ((s32) in[8])
+					+ ((limb) ((s32) in[2])) * ((s32) in[9]));
+	output[12] = ((limb) ((s32) in[6])) * ((s32) in[6])
+			+ 2
+					* (((limb) ((s32) in[4])) * ((s32) in[8])
+							+ 2
+									* (((limb) ((s32) in[5])) * ((s32) in[7])
+											+ ((limb) ((s32) in[3]))
+													* ((s32) in[9])));
+	output[13] = 2
+			* (((limb) ((s32) in[6])) * ((s32) in[7])
+					+ ((limb) ((s32) in[5])) * ((s32) in[8])
+					+ ((limb) ((s32) in[4])) * ((s32) in[9]));
+	output[14] = 2
+			* (((limb) ((s32) in[7])) * ((s32) in[7])
+					+ ((limb) ((s32) in[6])) * ((s32) in[8])
+					+ 2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
+	output[15] = 2
+			* (((limb) ((s32) in[7])) * ((s32) in[8])
+					+ ((limb) ((s32) in[6])) * ((s32) in[9]));
+	output[16] = ((limb) ((s32) in[8])) * ((s32) in[8])
+			+ 4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
+	output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
+	output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
+}
+
+/* fsquare sets output = in^2.
+ *
+ * On entry: The |in| argument is in reduced coefficients form and |in[i]| <
+ * 2^27.
+ *
+ * On exit: The |output| argument is in reduced coefficients form (indeed, one
+ * need only provide storage for 10 limbs) and |out[i]| < 2^26. */
+static void fsquare (limb *output, const limb *in)
+{
+	limb t[19];
+	fsquare_inner (t, in);
+	/* |t[i]| < 14*2^54 because the largest product of two limbs will be <
+	 * 2^(27+27) and fsquare_inner adds together, at most, 14 of those
+	 * products. */
+	freduce_degree (t);
+	freduce_coefficients (t);
+	/* |t[i]| < 2^26 */
+	memcpy (output, t, sizeof(limb) * 10);
+}
+
+/* Take a little-endian, 32-byte number and expand it into polynomial form */
+static void fexpand (limb *output, const u8 *input)
+{
+#define F(n,start,shift,mask) \
+  output[n] = ((((limb) input[start + 0]) | \
+                ((limb) input[start + 1]) << 8 | \
+                ((limb) input[start + 2]) << 16 | \
+                ((limb) input[start + 3]) << 24) >> shift) & mask;
+	F(0, 0, 0, 0x3ffffff);
+	F(1, 3, 2, 0x1ffffff);
+	F(2, 6, 3, 0x3ffffff);
+	F(3, 9, 5, 0x1ffffff);
+	F(4, 12, 6, 0x3ffffff);
+	F(5, 16, 0, 0x1ffffff);
+	F(6, 19, 1, 0x3ffffff);
+	F(7, 22, 3, 0x1ffffff);
+	F(8, 25, 4, 0x3ffffff);
+	F(9, 28, 6, 0x1ffffff);
+#undef F
+}
+
+#if (-32 >> 1) != -16
+#error "This code only works when >> does sign-extension on negative numbers"
+#endif
+
+/* s32_eq returns 0xffffffff iff a == b and zero otherwise. */
+static s32 s32_eq (s32 a, s32 b)
+{
+	a = ~(a ^ b);
+	a &= a << 16;
+	a &= a << 8;
+	a &= a << 4;
+	a &= a << 2;
+	a &= a << 1;
+	return a >> 31;
+}
+
+/* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are
+ * both non-negative. */
+static s32 s32_gte (s32 a, s32 b)
+{
+	a -= b;
+	/* a >= 0 iff a >= b. */
+	return ~(a >> 31);
+}
+
+/* Take a fully reduced polynomial form number and contract it into a
+ * little-endian, 32-byte array.
+ *
+ * On entry: |input_limbs[i]| < 2^26 */
+static void fcontract (u8 *output, limb *input_limbs)
+{
+	int i;
+	int j;
+	s32 input[10];
+	s32 mask;
+
+	/* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */
+	for (i = 0; i < 10; i++) {
+		input[i] = input_limbs[i];
+	}
+
+	for (j = 0; j < 2; ++j) {
+		for (i = 0; i < 9; ++i) {
+			if ((i & 1) == 1) {
+				/* This calculation is a time-invariant way to make input[i]
+				 * non-negative by borrowing from the next-larger limb. */
+				const s32 mask = input[i] >> 31;
+				const s32 carry = -((input[i] & mask) >> 25);
+				input[i] = input[i] + (carry << 25);
+				input[i + 1] = input[i + 1] - carry;
+			}
+			else {
+				const s32 mask = input[i] >> 31;
+				const s32 carry = -((input[i] & mask) >> 26);
+				input[i] = input[i] + (carry << 26);
+				input[i + 1] = input[i + 1] - carry;
+			}
+		}
+
+		/* There's no greater limb for input[9] to borrow from, but we can multiply
+		 * by 19 and borrow from input[0], which is valid mod 2^255-19. */
+		{
+			const s32 mask = input[9] >> 31;
+			const s32 carry = -((input[9] & mask) >> 25);
+			input[9] = input[9] + (carry << 25);
+			input[0] = input[0] - (carry * 19);
+		}
+
+		/* After the first iteration, input[1..9] are non-negative and fit within
+		 * 25 or 26 bits, depending on position. However, input[0] may be
+		 * negative. */
+	}
+
+	/* The first borrow-propagation pass above ended with every limb
+	 except (possibly) input[0] non-negative.
+
+	 If input[0] was negative after the first pass, then it was because of a
+	 carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most,
+	 one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19.
+
+	 In the second pass, each limb is decreased by at most one. Thus the second
+	 borrow-propagation pass could only have wrapped around to decrease
+	 input[0] again if the first pass left input[0] negative *and* input[1]
+	 through input[9] were all zero.  In that case, input[1] is now 2^25 - 1,
+	 and this last borrow-propagation step will leave input[1] non-negative. */
+	{
+		const s32 mask = input[0] >> 31;
+		const s32 carry = -((input[0] & mask) >> 26);
+		input[0] = input[0] + (carry << 26);
+		input[1] = input[1] - carry;
+	}
+
+	/* All input[i] are now non-negative. However, there might be values between
+	 * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */
+	for (j = 0; j < 2; j++) {
+		for (i = 0; i < 9; i++) {
+			if ((i & 1) == 1) {
+				const s32 carry = input[i] >> 25;
+				input[i] &= 0x1ffffff;
+				input[i + 1] += carry;
+			}
+			else {
+				const s32 carry = input[i] >> 26;
+				input[i] &= 0x3ffffff;
+				input[i + 1] += carry;
+			}
+		}
+
+		{
+			const s32 carry = input[9] >> 25;
+			input[9] &= 0x1ffffff;
+			input[0] += 19 * carry;
+		}
+	}
+
+	/* If the first carry-chain pass, just above, ended up with a carry from
+	 * input[9], and that caused input[0] to be out-of-bounds, then input[0] was
+	 * < 2^26 + 2*19, because the carry was, at most, two.
+	 *
+	 * If the second pass carried from input[9] again then input[0] is < 2*19 and
+	 * the input[9] -> input[0] carry didn't push input[0] out of bounds. */
+
+	/* It still remains the case that input might be between 2^255-19 and 2^255.
+	 * In this case, input[1..9] must take their maximum value and input[0] must
+	 * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */
+	mask = s32_gte (input[0], 0x3ffffed);
+	for (i = 1; i < 10; i++) {
+		if ((i & 1) == 1) {
+			mask &= s32_eq (input[i], 0x1ffffff);
+		}
+		else {
+			mask &= s32_eq (input[i], 0x3ffffff);
+		}
+	}
+
+	/* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus
+	 * this conditionally subtracts 2^255-19. */
+	input[0] -= mask & 0x3ffffed;
+
+	for (i = 1; i < 10; i++) {
+		if ((i & 1) == 1) {
+			input[i] -= mask & 0x1ffffff;
+		}
+		else {
+			input[i] -= mask & 0x3ffffff;
+		}
+	}
+
+	input[1] <<= 2;
+	input[2] <<= 3;
+	input[3] <<= 5;
+	input[4] <<= 6;
+	input[6] <<= 1;
+	input[7] <<= 3;
+	input[8] <<= 4;
+	input[9] <<= 6;
+#define F(i, s) \
+  output[s+0] |=  input[i] & 0xff; \
+  output[s+1]  = (input[i] >> 8) & 0xff; \
+  output[s+2]  = (input[i] >> 16) & 0xff; \
+  output[s+3]  = (input[i] >> 24) & 0xff;
+	output[0] = 0;
+	output[16] = 0;
+	F(0, 0);
+	F(1, 3);
+	F(2, 6);
+	F(3, 9);
+	F(4, 12);
+	F(5, 16);
+	F(6, 19);
+	F(7, 22);
+	F(8, 25);
+	F(9, 28);
+#undef F
+}
+
+/* Input: Q, Q', Q-Q'
+ * Output: 2Q, Q+Q'
+ *
+ *   x2 z3: long form
+ *   x3 z3: long form
+ *   x z: short form, destroyed
+ *   xprime zprime: short form, destroyed
+ *   qmqp: short form, preserved
+ *
+ * On entry and exit, the absolute value of the limbs of all inputs and outputs
+ * are < 2^26. */
+static void fmonty (limb *x2, limb *z2, /* output 2Q */
+limb *x3, limb *z3, /* output Q + Q' */
+limb *x, limb *z, /* input Q */
+limb *xprime, limb *zprime, /* input Q' */
+const limb *qmqp /* input Q - Q' */)
+{
+	limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
+			zzprime[19], zzzprime[19], xxxprime[19];
+
+	memcpy (origx, x, 10 * sizeof(limb));
+	fsum (x, z);
+	/* |x[i]| < 2^27 */
+	fdifference (z, origx); /* does x - z */
+	/* |z[i]| < 2^27 */
+
+	memcpy (origxprime, xprime, sizeof(limb) * 10);
+	fsum (xprime, zprime);
+	/* |xprime[i]| < 2^27 */
+	fdifference (zprime, origxprime);
+	/* |zprime[i]| < 2^27 */
+	fproduct (xxprime, xprime, z);
+	/* |xxprime[i]| < 14*2^54: the largest product of two limbs will be <
+	 * 2^(27+27) and fproduct adds together, at most, 14 of those products.
+	 * (Approximating that to 2^58 doesn't work out.) */
+	fproduct (zzprime, x, zprime);
+	/* |zzprime[i]| < 14*2^54 */
+	freduce_degree (xxprime);
+	freduce_coefficients (xxprime);
+	/* |xxprime[i]| < 2^26 */
+	freduce_degree (zzprime);
+	freduce_coefficients (zzprime);
+	/* |zzprime[i]| < 2^26 */
+	memcpy (origxprime, xxprime, sizeof(limb) * 10);
+	fsum (xxprime, zzprime);
+	/* |xxprime[i]| < 2^27 */
+	fdifference (zzprime, origxprime);
+	/* |zzprime[i]| < 2^27 */
+	fsquare (xxxprime, xxprime);
+	/* |xxxprime[i]| < 2^26 */
+	fsquare (zzzprime, zzprime);
+	/* |zzzprime[i]| < 2^26 */
+	fproduct (zzprime, zzzprime, qmqp);
+	/* |zzprime[i]| < 14*2^52 */
+	freduce_degree (zzprime);
+	freduce_coefficients (zzprime);
+	/* |zzprime[i]| < 2^26 */
+	memcpy (x3, xxxprime, sizeof(limb) * 10);
+	memcpy (z3, zzprime, sizeof(limb) * 10);
+
+	fsquare (xx, x);
+	/* |xx[i]| < 2^26 */
+	fsquare (zz, z);
+	/* |zz[i]| < 2^26 */
+	fproduct (x2, xx, zz);
+	/* |x2[i]| < 14*2^52 */
+	freduce_degree (x2);
+	freduce_coefficients (x2);
+	/* |x2[i]| < 2^26 */
+	fdifference (zz, xx);  // does zz = xx - zz
+	/* |zz[i]| < 2^27 */
+	memset (zzz + 10, 0, sizeof(limb) * 9);
+	fscalar_product (zzz, zz, 121665);
+	/* |zzz[i]| < 2^(27+17) */
+	/* No need to call freduce_degree here:
+	 fscalar_product doesn't increase the degree of its input. */
+	freduce_coefficients (zzz);
+	/* |zzz[i]| < 2^26 */
+	fsum (zzz, xx);
+	/* |zzz[i]| < 2^27 */
+	fproduct (z2, zz, zzz);
+	/* |z2[i]| < 14*2^(26+27) */
+	freduce_degree (z2);
+	freduce_coefficients (z2);
+	/* |z2|i| < 2^26 */
+}
+
+/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
+ * them unchanged if 'iswap' is 0.  Runs in data-invariant time to avoid
+ * side-channel attacks.
+ *
+ * NOTE that this function requires that 'iswap' be 1 or 0; other values give
+ * wrong results.  Also, the two limb arrays must be in reduced-coefficient,
+ * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
+ * and all all values in a[0..9],b[0..9] must have magnitude less than
+ * INT32_MAX. */
+static void swap_conditional (limb a[19], limb b[19], limb iswap)
+{
+	unsigned i;
+	const s32 swap = (s32) -iswap;
+
+	for (i = 0; i < 10; ++i) {
+		const s32 x = swap & (((s32) a[i]) ^ ((s32) b[i]));
+		a[i] = ((s32) a[i]) ^ x;
+		b[i] = ((s32) b[i]) ^ x;
+	}
+}
+
+/* Calculates nQ where Q is the x-coordinate of a point on the curve
+ *
+ *   resultx/resultz: the x coordinate of the resulting curve point (short form)
+ *   n: a little endian, 32-byte number
+ *   q: a point of the curve (short form) */
+static void cmult (limb *resultx, limb *resultz, const u8 *n, const limb *q)
+{
+	limb a[19] = { 0 }, b[19] = { 1 }, c[19] = { 1 }, d[19] = { 0 };
+	limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
+	limb e[19] = { 0 }, f[19] = { 1 }, g[19] = { 0 }, h[19] = { 1 };
+	limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
+
+	unsigned i, j;
+
+	memcpy (nqpqx, q, sizeof(limb) * 10);
+
+	for (i = 0; i < 32; ++i) {
+		u8 byte = n[31 - i];
+		for (j = 0; j < 8; ++j) {
+			const limb bit = byte >> 7;
+
+			swap_conditional (nqx, nqpqx, bit);
+			swap_conditional (nqz, nqpqz, bit);
+			fmonty (nqx2, nqz2, nqpqx2, nqpqz2, nqx, nqz, nqpqx, nqpqz, q);
+			swap_conditional (nqx2, nqpqx2, bit);
+			swap_conditional (nqz2, nqpqz2, bit);
+
+			t = nqx;
+			nqx = nqx2;
+			nqx2 = t;
+			t = nqz;
+			nqz = nqz2;
+			nqz2 = t;
+			t = nqpqx;
+			nqpqx = nqpqx2;
+			nqpqx2 = t;
+			t = nqpqz;
+			nqpqz = nqpqz2;
+			nqpqz2 = t;
+
+			byte <<= 1;
+		}
+	}
+
+	memcpy (resultx, nqx, sizeof(limb) * 10);
+	memcpy (resultz, nqz, sizeof(limb) * 10);
+}
+
+// -----------------------------------------------------------------------------
+// Shamelessly copied from djb's code
+// -----------------------------------------------------------------------------
+static void crecip (limb *out, const limb *z)
+{
+	limb z2[10];
+	limb z9[10];
+	limb z11[10];
+	limb z2_5_0[10];
+	limb z2_10_0[10];
+	limb z2_20_0[10];
+	limb z2_50_0[10];
+	limb z2_100_0[10];
+	limb t0[10];
+	limb t1[10];
+	int i;
+
+	/* 2 */fsquare (z2, z);
+	/* 4 */fsquare (t1, z2);
+	/* 8 */fsquare (t0, t1);
+	/* 9 */fmul (z9, t0, z);
+	/* 11 */fmul (z11, z9, z2);
+	/* 22 */fsquare (t0, z11);
+	/* 2^5 - 2^0 = 31 */fmul (z2_5_0, t0, z9);
+
+	/* 2^6 - 2^1 */fsquare (t0, z2_5_0);
+	/* 2^7 - 2^2 */fsquare (t1, t0);
+	/* 2^8 - 2^3 */fsquare (t0, t1);
+	/* 2^9 - 2^4 */fsquare (t1, t0);
+	/* 2^10 - 2^5 */fsquare (t0, t1);
+	/* 2^10 - 2^0 */fmul (z2_10_0, t0, z2_5_0);
+
+	/* 2^11 - 2^1 */fsquare (t0, z2_10_0);
+	/* 2^12 - 2^2 */fsquare (t1, t0);
+	/* 2^20 - 2^10 */for (i = 2; i < 10; i += 2) {
+		fsquare (t0, t1);
+		fsquare (t1, t0);
+	}
+	/* 2^20 - 2^0 */fmul (z2_20_0, t1, z2_10_0);
+
+	/* 2^21 - 2^1 */fsquare (t0, z2_20_0);
+	/* 2^22 - 2^2 */fsquare (t1, t0);
+	/* 2^40 - 2^20 */for (i = 2; i < 20; i += 2) {
+		fsquare (t0, t1);
+		fsquare (t1, t0);
+	}
+	/* 2^40 - 2^0 */fmul (t0, t1, z2_20_0);
+
+	/* 2^41 - 2^1 */fsquare (t1, t0);
+	/* 2^42 - 2^2 */fsquare (t0, t1);
+	/* 2^50 - 2^10 */for (i = 2; i < 10; i += 2) {
+		fsquare (t1, t0);
+		fsquare (t0, t1);
+	}
+	/* 2^50 - 2^0 */fmul (z2_50_0, t0, z2_10_0);
+
+	/* 2^51 - 2^1 */fsquare (t0, z2_50_0);
+	/* 2^52 - 2^2 */fsquare (t1, t0);
+	/* 2^100 - 2^50 */for (i = 2; i < 50; i += 2) {
+		fsquare (t0, t1);
+		fsquare (t1, t0);
+	}
+	/* 2^100 - 2^0 */fmul (z2_100_0, t1, z2_50_0);
+
+	/* 2^101 - 2^1 */fsquare (t1, z2_100_0);
+	/* 2^102 - 2^2 */fsquare (t0, t1);
+	/* 2^200 - 2^100 */for (i = 2; i < 100; i += 2) {
+		fsquare (t1, t0);
+		fsquare (t0, t1);
+	}
+	/* 2^200 - 2^0 */fmul (t1, t0, z2_100_0);
+
+	/* 2^201 - 2^1 */fsquare (t0, t1);
+	/* 2^202 - 2^2 */fsquare (t1, t0);
+	/* 2^250 - 2^50 */for (i = 2; i < 50; i += 2) {
+		fsquare (t0, t1);
+		fsquare (t1, t0);
+	}
+	/* 2^250 - 2^0 */fmul (t0, t1, z2_50_0);
+
+	/* 2^251 - 2^1 */fsquare (t1, t0);
+	/* 2^252 - 2^2 */fsquare (t0, t1);
+	/* 2^253 - 2^3 */fsquare (t1, t0);
+	/* 2^254 - 2^4 */fsquare (t0, t1);
+	/* 2^255 - 2^5 */fsquare (t1, t0);
+	/* 2^255 - 21 */fmul (out, t1, z11);
+}
+
+int curve25519 (u8 *mypublic, const u8 *secret, const u8 *basepoint)
+{
+	limb bp[10], x[10], z[11], zmone[10];
+	uint8_t e[32];
+	int i;
+
+	for (i = 0; i < 32; ++i)
+		e[i] = secret[i];
+	e[0] &= 248;
+	e[31] &= 127;
+	e[31] |= 64;
+
+	fexpand (bp, basepoint);
+	cmult (x, z, e, bp);
+	crecip (zmone, z);
+	fmul (z, x, zmone);
+	fcontract (mypublic, z);
+	return 0;
+}
diff --git a/src/libcryptobox/curve25519/curve25519.h b/src/libcryptobox/curve25519/curve25519.h
new file mode 100644
index 000000000..2d87e34cb
--- /dev/null
+++ b/src/libcryptobox/curve25519/curve25519.h
@@ -0,0 +1,10 @@
+#ifndef CURVE25519_H
+#define CURVE25519_H
+
+#include "config.h"
+
+static const guchar curve25519_basepoint[32] = {9};
+
+int curve25519 (guchar *mypublic, const guchar *secret, const guchar *basepoint);
+
+#endif
diff --git a/src/libcryptobox/curve25519/ref.c b/src/libcryptobox/curve25519/ref.c
new file mode 100644
index 000000000..ac3827c27
--- /dev/null
+++ b/src/libcryptobox/curve25519/ref.c
@@ -0,0 +1,323 @@
+/*
+version 20081011
+Matthew Dempsky
+Public domain.
+Derived from public domain code by D. J. Bernstein.
+20140216 tweak: Mask top bit of point input.
+*/
+
+static void add (unsigned int out[32], const unsigned int a[32],
+		const unsigned int b[32])
+{
+	unsigned int j;
+	unsigned int u;
+	u = 0;
+	for (j = 0; j < 31; ++j) {
+		u += a[j] + b[j];
+		out[j] = u & 255;
+		u >>= 8;
+	}
+	u += a[31] + b[31];
+	out[31] = u;
+}
+
+static void sub (unsigned int out[32], const unsigned int a[32],
+		const unsigned int b[32])
+{
+	unsigned int j;
+	unsigned int u;
+	u = 218;
+	for (j = 0; j < 31; ++j) {
+		u += a[j] + 65280 - b[j];
+		out[j] = u & 255;
+		u >>= 8;
+	}
+	u += a[31] - b[31];
+	out[31] = u;
+}
+
+static void squeeze (unsigned int a[32])
+{
+	unsigned int j;
+	unsigned int u;
+	u = 0;
+	for (j = 0; j < 31; ++j) {
+		u += a[j];
+		a[j] = u & 255;
+		u >>= 8;
+	}
+	u += a[31];
+	a[31] = u & 127;
+	u = 19 * (u >> 7);
+	for (j = 0; j < 31; ++j) {
+		u += a[j];
+		a[j] = u & 255;
+		u >>= 8;
+	}
+	u += a[31];
+	a[31] = u;
+}
+
+static const unsigned int minusp[32] = { 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 };
+
+static void freeze (unsigned int a[32])
+{
+	unsigned int aorig[32];
+	unsigned int j;
+	unsigned int negative;
+
+	for (j = 0; j < 32; ++j)
+		aorig[j] = a[j];
+	add (a, a, minusp);
+	negative = -((a[31] >> 7) & 1);
+	for (j = 0; j < 32; ++j)
+		a[j] ^= negative & (aorig[j] ^ a[j]);
+}
+
+static void mult (unsigned int out[32], const unsigned int a[32],
+		const unsigned int b[32])
+{
+	unsigned int i;
+	unsigned int j;
+	unsigned int u;
+
+	for (i = 0; i < 32; ++i) {
+		u = 0;
+		for (j = 0; j <= i; ++j)
+			u += a[j] * b[i - j];
+		for (j = i + 1; j < 32; ++j)
+			u += 38 * a[j] * b[i + 32 - j];
+		out[i] = u;
+	}
+	squeeze (out);
+}
+
+static void mult121665 (unsigned int out[32], const unsigned int a[32])
+{
+	unsigned int j;
+	unsigned int u;
+
+	u = 0;
+	for (j = 0; j < 31; ++j) {
+		u += 121665 * a[j];
+		out[j] = u & 255;
+		u >>= 8;
+	}
+	u += 121665 * a[31];
+	out[31] = u & 127;
+	u = 19 * (u >> 7);
+	for (j = 0; j < 31; ++j) {
+		u += out[j];
+		out[j] = u & 255;
+		u >>= 8;
+	}
+	u += out[j];
+	out[j] = u;
+}
+
+static void square (unsigned int out[32], const unsigned int a[32])
+{
+	unsigned int i;
+	unsigned int j;
+	unsigned int u;
+
+	for (i = 0; i < 32; ++i) {
+		u = 0;
+		for (j = 0; j < i - j; ++j)
+			u += a[j] * a[i - j];
+		for (j = i + 1; j < i + 32 - j; ++j)
+			u += 38 * a[j] * a[i + 32 - j];
+		u *= 2;
+		if ((i & 1) == 0) {
+			u += a[i / 2] * a[i / 2];
+			u += 38 * a[i / 2 + 16] * a[i / 2 + 16];
+		}
+		out[i] = u;
+	}
+	squeeze (out);
+}
+
+static void select (unsigned int p[64], unsigned int q[64],
+		const unsigned int r[64], const unsigned int s[64], unsigned int b)
+{
+	unsigned int j;
+	unsigned int t;
+	unsigned int bminus1;
+
+	bminus1 = b - 1;
+	for (j = 0; j < 64; ++j) {
+		t = bminus1 & (r[j] ^ s[j]);
+		p[j] = s[j] ^ t;
+		q[j] = r[j] ^ t;
+	}
+}
+
+static void mainloop (unsigned int work[64], const unsigned char e[32])
+{
+	unsigned int xzm1[64];
+	unsigned int xzm[64];
+	unsigned int xzmb[64];
+	unsigned int xzm1b[64];
+	unsigned int xznb[64];
+	unsigned int xzn1b[64];
+	unsigned int a0[64];
+	unsigned int a1[64];
+	unsigned int b0[64];
+	unsigned int b1[64];
+	unsigned int c1[64];
+	unsigned int r[32];
+	unsigned int s[32];
+	unsigned int t[32];
+	unsigned int u[32];
+	unsigned int i;
+	unsigned int j;
+	unsigned int b;
+	int pos;
+
+	for (j = 0; j < 32; ++j)
+		xzm1[j] = work[j];
+	xzm1[32] = 1;
+	for (j = 33; j < 64; ++j)
+		xzm1[j] = 0;
+
+	xzm[0] = 1;
+	for (j = 1; j < 64; ++j)
+		xzm[j] = 0;
+
+	for (pos = 254; pos >= 0; --pos) {
+		b = e[pos / 8] >> (pos & 7);
+		b &= 1;
+		select (xzmb, xzm1b, xzm, xzm1, b);
+		add (a0, xzmb, xzmb + 32);
+		sub (a0 + 32, xzmb, xzmb + 32);
+		add (a1, xzm1b, xzm1b + 32);
+		sub (a1 + 32, xzm1b, xzm1b + 32);
+		square (b0, a0);
+		square (b0 + 32, a0 + 32);
+		mult (b1, a1, a0 + 32);
+		mult (b1 + 32, a1 + 32, a0);
+		add (c1, b1, b1 + 32);
+		sub (c1 + 32, b1, b1 + 32);
+		square (r, c1 + 32);
+		sub (s, b0, b0 + 32);
+		mult121665 (t, s);
+		add (u, t, b0);
+		mult (xznb, b0, b0 + 32);
+		mult (xznb + 32, s, u);
+		square (xzn1b, c1);
+		mult (xzn1b + 32, r, work);
+		select (xzm, xzm1, xznb, xzn1b, b);
+	}
+
+	for (j = 0; j < 64; ++j)
+		work[j] = xzm[j];
+}
+
+static void recip (unsigned int out[32], const unsigned int z[32])
+{
+	unsigned int z2[32];
+	unsigned int z9[32];
+	unsigned int z11[32];
+	unsigned int z2_5_0[32];
+	unsigned int z2_10_0[32];
+	unsigned int z2_20_0[32];
+	unsigned int z2_50_0[32];
+	unsigned int z2_100_0[32];
+	unsigned int t0[32];
+	unsigned int t1[32];
+	int i;
+
+	/* 2 */square (z2, z);
+	/* 4 */square (t1, z2);
+	/* 8 */square (t0, t1);
+	/* 9 */mult (z9, t0, z);
+	/* 11 */mult (z11, z9, z2);
+	/* 22 */square (t0, z11);
+	/* 2^5 - 2^0 = 31 */mult (z2_5_0, t0, z9);
+
+	/* 2^6 - 2^1 */square (t0, z2_5_0);
+	/* 2^7 - 2^2 */square (t1, t0);
+	/* 2^8 - 2^3 */square (t0, t1);
+	/* 2^9 - 2^4 */square (t1, t0);
+	/* 2^10 - 2^5 */square (t0, t1);
+	/* 2^10 - 2^0 */mult (z2_10_0, t0, z2_5_0);
+
+	/* 2^11 - 2^1 */square (t0, z2_10_0);
+	/* 2^12 - 2^2 */square (t1, t0);
+	/* 2^20 - 2^10 */for (i = 2; i < 10; i += 2) {
+		square (t0, t1);
+		square (t1, t0);
+	}
+	/* 2^20 - 2^0 */mult (z2_20_0, t1, z2_10_0);
+
+	/* 2^21 - 2^1 */square (t0, z2_20_0);
+	/* 2^22 - 2^2 */square (t1, t0);
+	/* 2^40 - 2^20 */for (i = 2; i < 20; i += 2) {
+		square (t0, t1);
+		square (t1, t0);
+	}
+	/* 2^40 - 2^0 */mult (t0, t1, z2_20_0);
+
+	/* 2^41 - 2^1 */square (t1, t0);
+	/* 2^42 - 2^2 */square (t0, t1);
+	/* 2^50 - 2^10 */for (i = 2; i < 10; i += 2) {
+		square (t1, t0);
+		square (t0, t1);
+	}
+	/* 2^50 - 2^0 */mult (z2_50_0, t0, z2_10_0);
+
+	/* 2^51 - 2^1 */square (t0, z2_50_0);
+	/* 2^52 - 2^2 */square (t1, t0);
+	/* 2^100 - 2^50 */for (i = 2; i < 50; i += 2) {
+		square (t0, t1);
+		square (t1, t0);
+	}
+	/* 2^100 - 2^0 */mult (z2_100_0, t1, z2_50_0);
+
+	/* 2^101 - 2^1 */square (t1, z2_100_0);
+	/* 2^102 - 2^2 */square (t0, t1);
+	/* 2^200 - 2^100 */for (i = 2; i < 100; i += 2) {
+		square (t1, t0);
+		square (t0, t1);
+	}
+	/* 2^200 - 2^0 */mult (t1, t0, z2_100_0);
+
+	/* 2^201 - 2^1 */square (t0, t1);
+	/* 2^202 - 2^2 */square (t1, t0);
+	/* 2^250 - 2^50 */for (i = 2; i < 50; i += 2) {
+		square (t0, t1);
+		square (t1, t0);
+	}
+	/* 2^250 - 2^0 */mult (t0, t1, z2_50_0);
+
+	/* 2^251 - 2^1 */square (t1, t0);
+	/* 2^252 - 2^2 */square (t0, t1);
+	/* 2^253 - 2^3 */square (t1, t0);
+	/* 2^254 - 2^4 */square (t0, t1);
+	/* 2^255 - 2^5 */square (t1, t0);
+	/* 2^255 - 21 */mult (out, t1, z11);
+}
+
+int curve25519 (unsigned char *q, const unsigned char *n,
+		const unsigned char *p)
+{
+	unsigned int work[96];
+	unsigned char e[32];
+	unsigned int i;
+	for (i = 0; i < 32; ++i)
+		e[i] = n[i];
+	e[0] &= 248;
+	e[31] &= 127;
+	e[31] |= 64;
+	for (i = 0; i < 32; ++i)
+		work[i] = p[i];
+	work[31] &= 127;
+	mainloop (work, e);
+	recip (work + 32, work + 32);
+	mult (work + 64, work, work + 32);
+	freeze (work + 64);
+	for (i = 0; i < 32; ++i)
+		q[i] = work[64 + i];
+	return 0;
+}
diff --git a/src/libcryptobox/poly1305/README.md b/src/libcryptobox/poly1305/README.md
new file mode 100644
index 000000000..510e0f781
--- /dev/null
+++ b/src/libcryptobox/poly1305/README.md
@@ -0,0 +1,112 @@
+"A state-of-the-art message-authentication code"
+
+# ABOUT
+
+See: [http://cr.yp.to/mac.html](http://cr.yp.to/mac.html) and [http://cr.yp.to/mac/poly1305-20050329.pdf](http://cr.yp.to/mac/poly1305-20050329.pdf)
+
+These are quite portable implementations of increasing efficiency depending on the size of the multiplier available.
+Optimized implementations have been moved to [poly1305-opt](https://github.com/floodyberry/poly1305-opt)
+
+# BUILDING
+
+## Default
+
+If compiled with no options, `poly1305-donna.c` will select between the 32 bit and 64 bit implementations based
+on what it can tell the compiler supports
+
+    gcc poly1305-donna.c -O3 -o poly1305.o
+
+## Selecting a specific version
+
+    gcc poly1305-donna.c -O3 -o poly1305.o -DPOLY1305_XXBITS
+
+Where `-DPOLY1305_XXBITS` is one of
+
+ * `-DPOLY1305_8BITS`, 8->16 bit multiplies, 32 bit additions
+ * `-DPOLY1305_16BITS`, 16->32 bit multiples, 32 bit additions
+ * `-DPOLY1305_32BITS`, 32->64 bit multiplies, 64 bit additions
+ * `-DPOLY1305_64BITS`, 64->128 bit multiplies, 128 bit additions
+
+8 bit and 16 bit versions were written to keep the code size small, 32 bit and 64 bit versions are mildly optimized due
+to needing fewer multiplications. All 4 can be made faster at the expense of increased code size and complexity, which 
+is not the intention of this project.
+
+# USAGE
+
+See: [http://nacl.cace-project.eu/onetimeauth.html](http://nacl.cace-project.eu/onetimeauth.html), in specific, slightly plagiarized:
+
+The poly1305_auth function, viewed as a function of the message for a uniform random key, is 
+designed to meet the standard notion of unforgeability after a single message. After the sender 
+authenticates one message, an attacker cannot find authenticators for any other messages.
+
+The sender **MUST NOT** use poly1305_auth to authenticate more than one message under the same key.
+Authenticators for two messages under the same key should be expected to reveal enough information 
+to allow forgeries of authenticators on other messages. 
+
+## Functions
+
+`poly1305_context` is declared in [poly1305.h](poly1305.h) and is an opaque structure large enough to support 
+every underlying platform specific implementation. It should be size_t aligned, which should be handled already
+with the size_t member `aligner`.
+
+`void poly1305_init(poly1305_context *ctx, const unsigned char key[32]);`
+
+where
+
+`key` is the 32 byte key that is **only used for this message and is discarded immediately after**
+
+`void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes);`
+
+where `m` is a pointer to the message fragment to be processed, and
+
+`bytes` is the length of the message fragment
+
+`void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]);`
+
+where `mac` is the buffer which receives the 16 byte authenticator. After calling finish, the underlying
+implementation will zero out `ctx`.
+
+`void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]);`
+
+where `mac` is the buffer which receives the 16 byte authenticator,
+
+`m` is a pointer to the message to be processed,
+
+`bytes` is the number of bytes in the message, and
+
+`key` is the 32 byte key that is **only used for this message and is discarded immediately after**.
+
+`int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);`
+
+where `mac1` is compared to `mac2` in constant time and returns `1` if they are equal and `0` if they are not
+
+`int poly1305_power_on_self_test(void);`
+
+tests the underlying implementation to verify it is working correctly. It returns `1` if all tests pass, and `0` if 
+any tests fail.
+
+## Example
+
+### Simple
+
+    #include "poly1305-donna.h"
+
+    unsigned char key[32] = {...}, mac[16];
+    unsigned char msg[] = {...};
+
+    poly1305_auth(mac, msg, msglen, key);
+
+### Full
+
+[example-poly1305.c](example-poly1305.c) is a simple example of how to verify the underlying implementation is producing
+the correct results, compute an authenticator, and test it against an expected value.
+
+# LICENSE
+
+[MIT](http://www.opensource.org/licenses/mit-license.php) or PUBLIC DOMAIN
+
+
+# NAMESAKE
+
+I borrowed the idea for these from Adam Langley's [curve25519-donna](http://github.com/agl/curve25519-donna), hence
+the name.
+\ No newline at end of file
diff --git a/src/libcryptobox/poly1305/poly1305-donna-16.h b/src/libcryptobox/poly1305/poly1305-donna-16.h
new file mode 100644
index 000000000..5e5c6d3b6
--- /dev/null
+++ b/src/libcryptobox/poly1305/poly1305-donna-16.h
@@ -0,0 +1,202 @@
+/*
+	poly1305 implementation using 16 bit * 16 bit = 32 bit multiplication and 32 bit addition
+*/
+
+#if defined(_MSC_VER)
+	#define POLY1305_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+	#define POLY1305_NOINLINE __attribute__((noinline))
+#else
+	#define POLY1305_NOINLINE
+#endif
+
+#define poly1305_block_size 16
+
+/* 17 + sizeof(size_t) + 18*sizeof(unsigned short) */
+typedef struct poly1305_state_internal_t {
+	unsigned char buffer[poly1305_block_size];
+	size_t leftover;
+	unsigned short r[10];
+	unsigned short h[10];
+	unsigned short pad[8];
+	unsigned char final;
+} poly1305_state_internal_t;
+
+/* interpret two 8 bit unsigned integers as a 16 bit unsigned integer in little endian */
+static unsigned short
+U8TO16(const unsigned char *p) {
+	return
+		(((unsigned short)(p[0] & 0xff)      ) |
+	     ((unsigned short)(p[1] & 0xff) <<  8));
+}
+
+/* store a 16 bit unsigned integer as two 8 bit unsigned integers in little endian */
+static void
+U16TO8(unsigned char *p, unsigned short v) {
+	p[0] = (v      ) & 0xff;
+	p[1] = (v >>  8) & 0xff;
+}
+
+void
+poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
+	poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+	unsigned short t0,t1,t2,t3,t4,t5,t6,t7;
+	size_t i;
+
+	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+	t0 = U8TO16(&key[ 0]); st->r[0] = ( t0                    ) & 0x1fff;
+	t1 = U8TO16(&key[ 2]); st->r[1] = ((t0 >> 13) | (t1 <<  3)) & 0x1fff;
+	t2 = U8TO16(&key[ 4]); st->r[2] = ((t1 >> 10) | (t2 <<  6)) & 0x1f03;
+	t3 = U8TO16(&key[ 6]); st->r[3] = ((t2 >>  7) | (t3 <<  9)) & 0x1fff;
+	t4 = U8TO16(&key[ 8]); st->r[4] = ((t3 >>  4) | (t4 << 12)) & 0x00ff;
+	                       st->r[5] = ((t4 >>  1)             ) & 0x1ffe;
+	t5 = U8TO16(&key[10]); st->r[6] = ((t4 >> 14) | (t5 <<  2)) & 0x1fff;
+	t6 = U8TO16(&key[12]); st->r[7] = ((t5 >> 11) | (t6 <<  5)) & 0x1f81;
+	t7 = U8TO16(&key[14]); st->r[8] = ((t6 >>  8) | (t7 <<  8)) & 0x1fff;
+	                       st->r[9] = ((t7 >>  5)             ) & 0x007f;
+
+	/* h = 0 */
+	for (i = 0; i < 10; i++)
+		st->h[i] = 0;
+
+	/* save pad for later */
+	for (i = 0; i < 8; i++)
+		st->pad[i] = U8TO16(&key[16 + (2 * i)]);
+
+	st->leftover = 0;
+	st->final = 0;
+}
+
+static void
+poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
+	const unsigned short hibit = (st->final) ? 0 : (1 << 11); /* 1 << 128 */
+	unsigned short t0,t1,t2,t3,t4,t5,t6,t7;
+	unsigned long d[10];
+	unsigned long c;
+
+	while (bytes >= poly1305_block_size) {
+		size_t i, j;
+
+		/* h += m[i] */
+		t0 = U8TO16(&m[ 0]); st->h[0] += ( t0                    ) & 0x1fff;
+		t1 = U8TO16(&m[ 2]); st->h[1] += ((t0 >> 13) | (t1 <<  3)) & 0x1fff;
+		t2 = U8TO16(&m[ 4]); st->h[2] += ((t1 >> 10) | (t2 <<  6)) & 0x1fff;
+		t3 = U8TO16(&m[ 6]); st->h[3] += ((t2 >>  7) | (t3 <<  9)) & 0x1fff;
+		t4 = U8TO16(&m[ 8]); st->h[4] += ((t3 >>  4) | (t4 << 12)) & 0x1fff;
+		                     st->h[5] += ((t4 >>  1)             ) & 0x1fff;
+		t5 = U8TO16(&m[10]); st->h[6] += ((t4 >> 14) | (t5 <<  2)) & 0x1fff;
+		t6 = U8TO16(&m[12]); st->h[7] += ((t5 >> 11) | (t6 <<  5)) & 0x1fff;
+		t7 = U8TO16(&m[14]); st->h[8] += ((t6 >>  8) | (t7 <<  8)) & 0x1fff;
+		                     st->h[9] += ((t7 >>  5)             ) | hibit;
+
+		/* h *= r, (partial) h %= p */
+		for (i = 0, c = 0; i < 10; i++) {
+			d[i] = c;
+			for (j = 0; j < 10; j++) {
+				d[i] += (unsigned long)st->h[j] * ((j <= i) ? st->r[i - j] : (5 * st->r[i + 10 - j]));
+				/* Sum(h[i] * r[i] * 5) will overflow slightly above 6 products with an unclamped r, so carry at 5 */
+				if (j == 4) {
+					c = (d[i] >> 13);
+					d[i] &= 0x1fff;
+				}
+			}
+			c += (d[i] >> 13);
+			d[i] &= 0x1fff;
+		}
+		c = ((c << 2) + c); /* c *= 5 */
+		c += d[0];
+		d[0] = ((unsigned short)c & 0x1fff);
+		c = (c >> 13);
+		d[1] += c;
+
+		for (i = 0; i < 10; i++)
+			st->h[i] = (unsigned short)d[i];
+
+		m += poly1305_block_size;
+		bytes -= poly1305_block_size;
+	}
+}
+
+POLY1305_NOINLINE void
+poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
+	poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+	unsigned short c;
+	unsigned short g[10];
+	unsigned short mask;
+	unsigned long f;
+	size_t i;
+
+	/* process the remaining block */
+	if (st->leftover) {
+		size_t i = st->leftover;
+		st->buffer[i++] = 1;
+		for (; i < poly1305_block_size; i++)
+			st->buffer[i] = 0;
+		st->final = 1;
+		poly1305_blocks(st, st->buffer, poly1305_block_size);
+	}
+
+	/* fully carry h */
+	c = st->h[1] >> 13;
+	st->h[1] &= 0x1fff;
+	for (i = 2; i < 10; i++) {
+		st->h[i] += c;
+		c = st->h[i] >> 13;
+		st->h[i] &= 0x1fff;
+	}
+	st->h[0] += (c * 5);
+	c = st->h[0] >> 13;
+	st->h[0] &= 0x1fff;
+	st->h[1] += c;
+	c = st->h[1] >> 13;
+	st->h[1] &= 0x1fff;
+	st->h[2] += c;
+
+	/* compute h + -p */
+	g[0] = st->h[0] + 5;
+	c = g[0] >> 13;
+	g[0] &= 0x1fff;
+	for (i = 1; i < 10; i++) {
+		g[i] = st->h[i] + c;
+		c = g[i] >> 13;
+		g[i] &= 0x1fff;
+	}
+	g[9] -= (1 << 13);
+
+	/* select h if h < p, or h + -p if h >= p */
+	mask = (g[9] >> ((sizeof(unsigned short) * 8) - 1)) - 1;
+	for (i = 0; i < 10; i++)
+		g[i] &= mask;
+	mask = ~mask;
+	for (i = 0; i < 10; i++)
+		st->h[i] = (st->h[i] & mask) | g[i];
+
+	/* h = h % (2^128) */
+	st->h[0] = ((st->h[0]      ) | (st->h[1] << 13)                   ) & 0xffff;
+	st->h[1] = ((st->h[1] >>  3) | (st->h[2] << 10)                   ) & 0xffff;
+	st->h[2] = ((st->h[2] >>  6) | (st->h[3] <<  7)                   ) & 0xffff;
+	st->h[3] = ((st->h[3] >>  9) | (st->h[4] <<  4)                   ) & 0xffff;
+	st->h[4] = ((st->h[4] >> 12) | (st->h[5] <<  1) | (st->h[6] << 14)) & 0xffff;
+	st->h[5] = ((st->h[6] >>  2) | (st->h[7] << 11)                   ) & 0xffff;
+	st->h[6] = ((st->h[7] >>  5) | (st->h[8] <<  8)                   ) & 0xffff;
+	st->h[7] = ((st->h[8] >>  8) | (st->h[9] <<  5)                   ) & 0xffff;
+
+	/* mac = (h + pad) % (2^128) */
+	f = (unsigned long)st->h[0] + st->pad[0];
+	st->h[0] = (unsigned short)f;
+	for (i = 1; i < 8; i++) {
+		f = (unsigned long)st->h[i] + st->pad[i] + (f >> 16);
+		st->h[i] = (unsigned short)f;
+	}
+
+	for (i = 0; i < 8; i++)
+		U16TO8(mac + (i * 2), st->h[i]);
+
+	/* zero out the state */
+	for (i = 0; i < 10; i++)
+		st->h[i] = 0;
+	for (i = 0; i < 10; i++)
+		st->r[i] = 0;
+	for (i = 0; i < 8; i++)
+		st->pad[i] = 0;
+}
diff --git a/src/libcryptobox/poly1305/poly1305-donna-32.h b/src/libcryptobox/poly1305/poly1305-donna-32.h
new file mode 100644
index 000000000..c45aab33a
--- /dev/null
+++ b/src/libcryptobox/poly1305/poly1305-donna-32.h
@@ -0,0 +1,219 @@
+/*
+	poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition
+*/
+
+#if defined(_MSC_VER)
+	#define POLY1305_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+	#define POLY1305_NOINLINE __attribute__((noinline))
+#else
+	#define POLY1305_NOINLINE
+#endif
+
+#define poly1305_block_size 16
+
+/* 17 + sizeof(size_t) + 14*sizeof(unsigned long) */
+typedef struct poly1305_state_internal_t {
+	unsigned long r[5];
+	unsigned long h[5];
+	unsigned long pad[4];
+	size_t leftover;
+	unsigned char buffer[poly1305_block_size];
+	unsigned char final;
+} poly1305_state_internal_t;
+
+/* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */
+static unsigned long
+U8TO32(const unsigned char *p) {
+	return
+		(((unsigned long)(p[0] & 0xff)      ) |
+	     ((unsigned long)(p[1] & 0xff) <<  8) |
+         ((unsigned long)(p[2] & 0xff) << 16) |
+         ((unsigned long)(p[3] & 0xff) << 24));
+}
+
+/* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */
+static void
+U32TO8(unsigned char *p, unsigned long v) {
+	p[0] = (v      ) & 0xff;
+	p[1] = (v >>  8) & 0xff;
+	p[2] = (v >> 16) & 0xff;
+	p[3] = (v >> 24) & 0xff;
+}
+
+void
+poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
+	poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+
+	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+	st->r[0] = (U8TO32(&key[ 0])     ) & 0x3ffffff;
+	st->r[1] = (U8TO32(&key[ 3]) >> 2) & 0x3ffff03;
+	st->r[2] = (U8TO32(&key[ 6]) >> 4) & 0x3ffc0ff;
+	st->r[3] = (U8TO32(&key[ 9]) >> 6) & 0x3f03fff;
+	st->r[4] = (U8TO32(&key[12]) >> 8) & 0x00fffff;
+
+	/* h = 0 */
+	st->h[0] = 0;
+	st->h[1] = 0;
+	st->h[2] = 0;
+	st->h[3] = 0;
+	st->h[4] = 0;
+
+	/* save pad for later */
+	st->pad[0] = U8TO32(&key[16]);
+	st->pad[1] = U8TO32(&key[20]);
+	st->pad[2] = U8TO32(&key[24]);
+	st->pad[3] = U8TO32(&key[28]);
+
+	st->leftover = 0;
+	st->final = 0;
+}
+
+static void
+poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
+	const unsigned long hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */
+	unsigned long r0,r1,r2,r3,r4;
+	unsigned long s1,s2,s3,s4;
+	unsigned long h0,h1,h2,h3,h4;
+	unsigned long long d0,d1,d2,d3,d4;
+	unsigned long c;
+
+	r0 = st->r[0];
+	r1 = st->r[1];
+	r2 = st->r[2];
+	r3 = st->r[3];
+	r4 = st->r[4];
+
+	s1 = r1 * 5;
+	s2 = r2 * 5;
+	s3 = r3 * 5;
+	s4 = r4 * 5;
+
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+	h3 = st->h[3];
+	h4 = st->h[4];
+
+	while (bytes >= poly1305_block_size) {
+		/* h += m[i] */
+		h0 += (U8TO32(m+ 0)     ) & 0x3ffffff;
+		h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
+		h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
+		h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
+		h4 += (U8TO32(m+12) >> 8) | hibit;
+
+		/* h *= r */
+		d0 = ((unsigned long long)h0 * r0) + ((unsigned long long)h1 * s4) + ((unsigned long long)h2 * s3) + ((unsigned long long)h3 * s2) + ((unsigned long long)h4 * s1);
+		d1 = ((unsigned long long)h0 * r1) + ((unsigned long long)h1 * r0) + ((unsigned long long)h2 * s4) + ((unsigned long long)h3 * s3) + ((unsigned long long)h4 * s2);
+		d2 = ((unsigned long long)h0 * r2) + ((unsigned long long)h1 * r1) + ((unsigned long long)h2 * r0) + ((unsigned long long)h3 * s4) + ((unsigned long long)h4 * s3);
+		d3 = ((unsigned long long)h0 * r3) + ((unsigned long long)h1 * r2) + ((unsigned long long)h2 * r1) + ((unsigned long long)h3 * r0) + ((unsigned long long)h4 * s4);
+		d4 = ((unsigned long long)h0 * r4) + ((unsigned long long)h1 * r3) + ((unsigned long long)h2 * r2) + ((unsigned long long)h3 * r1) + ((unsigned long long)h4 * r0);
+
+		/* (partial) h %= p */
+		              c = (unsigned long)(d0 >> 26); h0 = (unsigned long)d0 & 0x3ffffff;
+		d1 += c;      c = (unsigned long)(d1 >> 26); h1 = (unsigned long)d1 & 0x3ffffff;
+		d2 += c;      c = (unsigned long)(d2 >> 26); h2 = (unsigned long)d2 & 0x3ffffff;
+		d3 += c;      c = (unsigned long)(d3 >> 26); h3 = (unsigned long)d3 & 0x3ffffff;
+		d4 += c;      c = (unsigned long)(d4 >> 26); h4 = (unsigned long)d4 & 0x3ffffff;
+		h0 += c * 5;  c =                (h0 >> 26); h0 =                h0 & 0x3ffffff;
+		h1 += c;
+
+		m += poly1305_block_size;
+		bytes -= poly1305_block_size;
+	}
+
+	st->h[0] = h0;
+	st->h[1] = h1;
+	st->h[2] = h2;
+	st->h[3] = h3;
+	st->h[4] = h4;
+}
+
+POLY1305_NOINLINE void
+poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
+	poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+	unsigned long h0,h1,h2,h3,h4,c;
+	unsigned long g0,g1,g2,g3,g4;
+	unsigned long long f;
+	unsigned long mask;
+
+	/* process the remaining block */
+	if (st->leftover) {
+		size_t i = st->leftover;
+		st->buffer[i++] = 1;
+		for (; i < poly1305_block_size; i++)
+			st->buffer[i] = 0;
+		st->final = 1;
+		poly1305_blocks(st, st->buffer, poly1305_block_size);
+	}
+
+	/* fully carry h */
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+	h3 = st->h[3];
+	h4 = st->h[4];
+
+	             c = h1 >> 26; h1 = h1 & 0x3ffffff;
+	h2 +=     c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
+	h3 +=     c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
+	h4 +=     c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
+	h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
+	h1 +=     c;
+
+	/* compute h + -p */
+	g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
+	g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
+	g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
+	g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
+	g4 = h4 + c - (1 << 26);
+
+	/* select h if h < p, or h + -p if h >= p */
+	mask = (g4 >> ((sizeof(unsigned long) * 8) - 1)) - 1;
+	g0 &= mask;
+	g1 &= mask;
+	g2 &= mask;
+	g3 &= mask;
+	g4 &= mask;
+	mask = ~mask;
+	h0 = (h0 & mask) | g0;
+	h1 = (h1 & mask) | g1;
+	h2 = (h2 & mask) | g2;
+	h3 = (h3 & mask) | g3;
+	h4 = (h4 & mask) | g4;
+
+	/* h = h % (2^128) */
+	h0 = ((h0      ) | (h1 << 26)) & 0xffffffff;
+	h1 = ((h1 >>  6) | (h2 << 20)) & 0xffffffff;
+	h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
+	h3 = ((h3 >> 18) | (h4 <<  8)) & 0xffffffff;
+
+	/* mac = (h + pad) % (2^128) */
+	f = (unsigned long long)h0 + st->pad[0]            ; h0 = (unsigned long)f;
+	f = (unsigned long long)h1 + st->pad[1] + (f >> 32); h1 = (unsigned long)f;
+	f = (unsigned long long)h2 + st->pad[2] + (f >> 32); h2 = (unsigned long)f;
+	f = (unsigned long long)h3 + st->pad[3] + (f >> 32); h3 = (unsigned long)f;
+
+	U32TO8(mac +  0, h0);
+	U32TO8(mac +  4, h1);
+	U32TO8(mac +  8, h2);
+	U32TO8(mac + 12, h3);
+
+	/* zero out the state */
+	st->h[0] = 0;
+	st->h[1] = 0;
+	st->h[2] = 0;
+	st->h[3] = 0;
+	st->h[4] = 0;
+	st->r[0] = 0;
+	st->r[1] = 0;
+	st->r[2] = 0;
+	st->r[3] = 0;
+	st->r[4] = 0;
+	st->pad[0] = 0;
+	st->pad[1] = 0;
+	st->pad[2] = 0;
+	st->pad[3] = 0;
+}
+
diff --git a/src/libcryptobox/poly1305/poly1305-donna-64.h b/src/libcryptobox/poly1305/poly1305-donna-64.h
new file mode 100644
index 000000000..016f5b384
--- /dev/null
+++ b/src/libcryptobox/poly1305/poly1305-donna-64.h
@@ -0,0 +1,224 @@
+/*
+	poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition
+*/
+
+#if defined(_MSC_VER)
+	#include <intrin.h>
+
+	typedef struct uint128_t {
+		unsigned long long lo;
+		unsigned long long hi;
+	} uint128_t;
+
+	#define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi)
+	#define ADD(out, in) { unsigned long long t = out.lo; out.lo += in.lo; out.hi += (out.lo < t) + in.hi; }
+	#define ADDLO(out, in) { unsigned long long t = out.lo; out.lo += in; out.hi += (out.lo < t); }
+	#define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift)))
+	#define LO(in) (in.lo)
+
+	#define POLY1305_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+	#if defined(__SIZEOF_INT128__)
+		typedef unsigned __int128 uint128_t;
+	#else
+		typedef unsigned uint128_t __attribute__((mode(TI)));
+	#endif
+
+	#define MUL(out, x, y) out = ((uint128_t)x * y)
+	#define ADD(out, in) out += in
+	#define ADDLO(out, in) out += in
+	#define SHR(in, shift) (unsigned long long)(in >> (shift))
+	#define LO(in) (unsigned long long)(in)
+
+	#define POLY1305_NOINLINE __attribute__((noinline))
+#endif
+
+#define poly1305_block_size 16
+
+/* 17 + sizeof(size_t) + 8*sizeof(unsigned long long) */
+typedef struct poly1305_state_internal_t {
+	unsigned long long r[3];
+	unsigned long long h[3];
+	unsigned long long pad[2];
+	size_t leftover;
+	unsigned char buffer[poly1305_block_size];
+	unsigned char final;
+} poly1305_state_internal_t;
+
+/* interpret eight 8 bit unsigned integers as a 64 bit unsigned integer in little endian */
+static unsigned long long
+U8TO64(const unsigned char *p) {
+	return
+		(((unsigned long long)(p[0] & 0xff)      ) |
+		 ((unsigned long long)(p[1] & 0xff) <<  8) |
+		 ((unsigned long long)(p[2] & 0xff) << 16) |
+		 ((unsigned long long)(p[3] & 0xff) << 24) |
+		 ((unsigned long long)(p[4] & 0xff) << 32) |
+		 ((unsigned long long)(p[5] & 0xff) << 40) |
+		 ((unsigned long long)(p[6] & 0xff) << 48) |
+		 ((unsigned long long)(p[7] & 0xff) << 56));
+}
+
+/* store a 64 bit unsigned integer as eight 8 bit unsigned integers in little endian */
+static void
+U64TO8(unsigned char *p, unsigned long long v) {
+	p[0] = (v      ) & 0xff;
+	p[1] = (v >>  8) & 0xff;
+	p[2] = (v >> 16) & 0xff;
+	p[3] = (v >> 24) & 0xff;
+	p[4] = (v >> 32) & 0xff;
+	p[5] = (v >> 40) & 0xff;
+	p[6] = (v >> 48) & 0xff;
+	p[7] = (v >> 56) & 0xff;
+}
+
+void
+poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
+	poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+	unsigned long long t0,t1;
+
+	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+	t0 = U8TO64(&key[0]);
+	t1 = U8TO64(&key[8]);
+
+	st->r[0] = ( t0                    ) & 0xffc0fffffff;
+	st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
+	st->r[2] = ((t1 >> 24)             ) & 0x00ffffffc0f;
+
+	/* h = 0 */
+	st->h[0] = 0;
+	st->h[1] = 0;
+	st->h[2] = 0;
+
+	/* save pad for later */
+	st->pad[0] = U8TO64(&key[16]);
+	st->pad[1] = U8TO64(&key[24]);
+
+	st->leftover = 0;
+	st->final = 0;
+}
+
+static void
+poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
+	const unsigned long long hibit = (st->final) ? 0 : ((unsigned long long)1 << 40); /* 1 << 128 */
+	unsigned long long r0,r1,r2;
+	unsigned long long s1,s2;
+	unsigned long long h0,h1,h2;
+	unsigned long long c;
+	uint128_t d0,d1,d2,d;
+
+	r0 = st->r[0];
+	r1 = st->r[1];
+	r2 = st->r[2];
+
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+
+	s1 = r1 * (5 << 2);
+	s2 = r2 * (5 << 2);
+
+	while (bytes >= poly1305_block_size) {
+		unsigned long long t0,t1;
+
+		/* h += m[i] */
+		t0 = U8TO64(&m[0]);
+		t1 = U8TO64(&m[8]);
+
+		h0 += (( t0                    ) & 0xfffffffffff);
+		h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
+		h2 += (((t1 >> 24)             ) & 0x3ffffffffff) | hibit;
+
+		/* h *= r */
+		MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
+		MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
+		MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);
+
+		/* (partial) h %= p */
+		              c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
+		ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
+		ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
+		h0  += c * 5; c = (h0 >> 44);  h0 =    h0  & 0xfffffffffff;
+		h1  += c;
+
+		m += poly1305_block_size;
+		bytes -= poly1305_block_size;
+	}
+
+	st->h[0] = h0;
+	st->h[1] = h1;
+	st->h[2] = h2;
+}
+
+
+POLY1305_NOINLINE void
+poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
+	poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+	unsigned long long h0,h1,h2,c;
+	unsigned long long g0,g1,g2;
+	unsigned long long t0,t1;
+
+	/* process the remaining block */
+	if (st->leftover) {
+		size_t i = st->leftover;
+		st->buffer[i] = 1;
+		for (i = i + 1; i < poly1305_block_size; i++)
+			st->buffer[i] = 0;
+		st->final = 1;
+		poly1305_blocks(st, st->buffer, poly1305_block_size);
+	}
+
+	/* fully carry h */
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+
+	             c = (h1 >> 44); h1 &= 0xfffffffffff;
+	h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
+	h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
+	h1 += c;     c = (h1 >> 44); h1 &= 0xfffffffffff;
+	h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
+	h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
+	h1 += c;
+
+	/* compute h + -p */
+	g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
+	g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
+	g2 = h2 + c - ((unsigned long long)1 << 42);
+
+	/* select h if h < p, or h + -p if h >= p */
+	c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1;
+	g0 &= c;
+	g1 &= c;
+	g2 &= c;
+	c = ~c;
+	h0 = (h0 & c) | g0;
+	h1 = (h1 & c) | g1;
+	h2 = (h2 & c) | g2;
+
+	/* h = (h + pad) */
+	t0 = st->pad[0];
+	t1 = st->pad[1];
+
+	h0 += (( t0                    ) & 0xfffffffffff)    ; c = (h0 >> 44); h0 &= 0xfffffffffff;
+	h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff;
+	h2 += (((t1 >> 24)             ) & 0x3ffffffffff) + c;                 h2 &= 0x3ffffffffff;
+
+	/* mac = h % (2^128) */
+	h0 = ((h0      ) | (h1 << 44));
+	h1 = ((h1 >> 20) | (h2 << 24));
+
+	U64TO8(&mac[0], h0);
+	U64TO8(&mac[8], h1);
+
+	/* zero out the state */
+	st->h[0] = 0;
+	st->h[1] = 0;
+	st->h[2] = 0;
+	st->r[0] = 0;
+	st->r[1] = 0;
+	st->r[2] = 0;
+	st->pad[0] = 0;
+	st->pad[1] = 0;
+}
+
diff --git a/src/libcryptobox/poly1305/poly1305-donna-8.h b/src/libcryptobox/poly1305/poly1305-donna-8.h
new file mode 100644
index 000000000..ac5d5aeb5
--- /dev/null
+++ b/src/libcryptobox/poly1305/poly1305-donna-8.h
@@ -0,0 +1,186 @@
+/*
+	poly1305 implementation using 8 bit * 8 bit = 16 bit multiplication and 32 bit addition
+
+	based on the public domain reference version in supercop by djb
+*/
+
+#if defined(_MSC_VER)
+	#define POLY1305_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+	#define POLY1305_NOINLINE __attribute__((noinline))
+#else
+	#define POLY1305_NOINLINE
+#endif
+
+#define poly1305_block_size 16
+
+/* 17 + sizeof(size_t) + 51*sizeof(unsigned char) */
+typedef struct poly1305_state_internal_t {
+	unsigned char buffer[poly1305_block_size];
+	size_t leftover;
+	unsigned char h[17];
+	unsigned char r[17];
+	unsigned char pad[17];
+	unsigned char final;
+} poly1305_state_internal_t;
+
+void
+poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
+	poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+	size_t i;
+
+	st->leftover = 0;
+
+	/* h = 0 */
+	for (i = 0; i < 17; i++)
+		st->h[i] = 0;
+
+	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+	st->r[ 0] = key[ 0] & 0xff;
+	st->r[ 1] = key[ 1] & 0xff;
+	st->r[ 2] = key[ 2] & 0xff;
+	st->r[ 3] = key[ 3] & 0x0f;
+	st->r[ 4] = key[ 4] & 0xfc;
+	st->r[ 5] = key[ 5] & 0xff;
+	st->r[ 6] = key[ 6] & 0xff;
+	st->r[ 7] = key[ 7] & 0x0f;
+	st->r[ 8] = key[ 8] & 0xfc;
+	st->r[ 9] = key[ 9] & 0xff;
+	st->r[10] = key[10] & 0xff;
+	st->r[11] = key[11] & 0x0f;
+	st->r[12] = key[12] & 0xfc;
+	st->r[13] = key[13] & 0xff;
+	st->r[14] = key[14] & 0xff;
+	st->r[15] = key[15] & 0x0f;
+	st->r[16] = 0;
+
+	/* save pad for later */
+	for (i = 0; i < 16; i++)
+		st->pad[i] = key[i + 16];
+	st->pad[16] = 0;
+
+	st->final = 0;
+}
+
+static void
+poly1305_add(unsigned char h[17], const unsigned char c[17]) {
+	unsigned short u;
+	unsigned int i;
+	for (u = 0, i = 0; i < 17; i++) {
+		u += (unsigned short)h[i] + (unsigned short)c[i];
+		h[i] = (unsigned char)u & 0xff;
+		u >>= 8;
+	}
+}
+
+static void
+poly1305_squeeze(unsigned char h[17], unsigned long hr[17]) {
+	unsigned long u;
+	unsigned int i;
+	u = 0;
+	for (i = 0; i < 16; i++) {
+		u += hr[i];
+		h[i] = (unsigned char)u & 0xff;
+		u >>= 8;
+	}
+	u += hr[16];
+	h[16] = (unsigned char)u & 0x03;
+	u >>= 2;
+	u += (u << 2); /* u *= 5; */
+	for (i = 0; i < 16; i++) {
+		u += h[i];
+		h[i] = (unsigned char)u & 0xff;
+		u >>= 8;
+	}
+	h[16] += (unsigned char)u;
+}
+
+static void
+poly1305_freeze(unsigned char h[17]) {
+	static const unsigned char minusp[17] = {
+		0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0xfc
+	};
+	unsigned char horig[17], negative;
+	unsigned int i;
+
+	/* compute h + -p */
+	for (i = 0; i < 17; i++)
+		horig[i] = h[i];
+	poly1305_add(h, minusp);
+
+	/* select h if h < p, or h + -p if h >= p */
+	negative = -(h[16] >> 7);
+	for (i = 0; i < 17; i++)
+		h[i] ^= negative & (horig[i] ^ h[i]);
+}
+
+static void
+poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
+	const unsigned char hibit = st->final ^ 1; /* 1 << 128 */
+
+	while (bytes >= poly1305_block_size) {
+		unsigned long hr[17], u;
+		unsigned char c[17];
+		unsigned int i, j;
+
+		/* h += m */
+		for (i = 0; i < 16; i++)
+			c[i] = m[i];
+		c[16] = hibit;
+		poly1305_add(st->h, c);
+
+		/* h *= r */
+		for (i = 0; i < 17; i++) {
+			u = 0;
+			for (j = 0; j <= i ; j++) {
+				u += (unsigned short)st->h[j] * st->r[i - j];
+			}
+			for (j = i + 1; j < 17; j++) {
+				unsigned long v = (unsigned short)st->h[j] * st->r[i + 17 - j];
+				v = ((v << 8) + (v << 6)); /* v *= (5 << 6); */
+				u += v;
+			}
+			hr[i] = u;
+		}
+
+		/* (partial) h %= p */
+		poly1305_squeeze(st->h, hr);
+
+		m += poly1305_block_size;
+		bytes -= poly1305_block_size;
+	}
+}
+
+POLY1305_NOINLINE void
+poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
+	poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+	size_t i;
+
+	/* process the remaining block */
+	if (st->leftover) {
+		size_t i = st->leftover;
+		st->buffer[i++] = 1;
+		for (; i < poly1305_block_size; i++)
+			st->buffer[i] = 0;
+		st->final = 1;
+		poly1305_blocks(st, st->buffer, poly1305_block_size);
+	}
+
+	/* fully reduce h */
+	poly1305_freeze(st->h);
+
+	/* h = (h + pad) % (1 << 128) */
+	poly1305_add(st->h, st->pad);
+	for (i = 0; i < 16; i++)
+		mac[i] = st->h[i];
+
+	/* zero out the state */
+	for (i = 0; i < 17; i++)
+		st->h[i] = 0;
+	for (i = 0; i < 17; i++)
+		st->r[i] = 0;
+	for (i = 0; i < 17; i++)
+		st->pad[i] = 0;
+}
diff --git a/src/libcryptobox/poly1305/poly1305-donna.c b/src/libcryptobox/poly1305/poly1305-donna.c
new file mode 100644
index 000000000..c1e3c74b5
--- /dev/null
+++ b/src/libcryptobox/poly1305/poly1305-donna.c
@@ -0,0 +1,201 @@
+#include "poly1305-donna.h"
+
+#if defined(POLY1305_8BIT)
+#include "poly1305-donna-8.h"
+#elif defined(POLY1305_16BIT)
+#include "poly1305-donna-16.h"
+#elif defined(POLY1305_32BIT)
+#include "poly1305-donna-32.h"
+#elif defined(POLY1305_64BIT)
+#include "poly1305-donna-64.h"
+#else
+
+/* auto detect between 32bit / 64bit */
+#define HAS_SIZEOF_INT128_64BIT (defined(__SIZEOF_INT128__) && defined(__LP64__))
+#define HAS_MSVC_64BIT (defined(_MSC_VER) && defined(_M_X64))
+#define HAS_GCC_4_4_64BIT (defined(__GNUC__) && defined(__LP64__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4))))
+
+#if (HAS_SIZEOF_INT128_64BIT || HAS_MSVC_64BIT || HAS_GCC_4_4_64BIT)
+#include "poly1305-donna-64.h"
+#else
+#include "poly1305-donna-32.h"
+#endif
+
+#endif
+
+void
+poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes) {
+	poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
+	size_t i;
+
+	/* handle leftover */
+	if (st->leftover) {
+		size_t want = (poly1305_block_size - st->leftover);
+		if (want > bytes)
+			want = bytes;
+		for (i = 0; i < want; i++)
+			st->buffer[st->leftover + i] = m[i];
+		bytes -= want;
+		m += want;
+		st->leftover += want;
+		if (st->leftover < poly1305_block_size)
+			return;
+		poly1305_blocks(st, st->buffer, poly1305_block_size);
+		st->leftover = 0;
+	}
+
+	/* process full blocks */
+	if (bytes >= poly1305_block_size) {
+		size_t want = (bytes & ~(poly1305_block_size - 1));
+		poly1305_blocks(st, m, want);
+		m += want;
+		bytes -= want;
+	}
+
+	/* store leftover */
+	if (bytes) {
+		for (i = 0; i < bytes; i++)
+			st->buffer[st->leftover + i] = m[i];
+		st->leftover += bytes;
+	}
+}
+
+void
+poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]) {
+	poly1305_context ctx;
+	poly1305_init(&ctx, key);
+	poly1305_update(&ctx, m, bytes);
+	poly1305_finish(&ctx, mac);
+}
+
+int
+poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]) {
+	size_t i;
+	unsigned int dif = 0;
+	for (i = 0; i < 16; i++)
+		dif |= (mac1[i] ^ mac2[i]);
+	dif = (dif - 1) >> ((sizeof(unsigned int) * 8) - 1);
+	return (dif & 1);
+}
+
+
+/* test a few basic operations */
+int
+poly1305_power_on_self_test(void) {
+	/* example from nacl */
+	static const unsigned char nacl_key[32] = {
+		0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91,
+		0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25,
+		0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65,
+		0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80,
+	};
+
+	static const unsigned char nacl_msg[131] = {
+		0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73,
+		0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce,
+		0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4,
+		0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a,
+		0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b,
+		0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72,
+		0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2,
+		0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38,
+		0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a,
+		0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae,
+		0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea,
+		0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda,
+		0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde,
+		0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3,
+		0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6,
+		0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74,
+		0xe3,0x55,0xa5
+	};
+
+	static const unsigned char nacl_mac[16] = {
+		0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5,
+		0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9
+	};
+
+	/* generates a final value of (2^130 - 2) == 3 */
+	static const unsigned char wrap_key[32] = {
+		0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	};
+
+	static const unsigned char wrap_msg[16] = {
+		0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
+		0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	};
+
+	static const unsigned char wrap_mac[16] = {
+		0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+	};
+
+	/*
+		mac of the macs of messages of length 0 to 256, where the key and messages
+		have all their values set to the length
+	*/
+	static const unsigned char total_key[32] = {
+		0x01,0x02,0x03,0x04,0x05,0x06,0x07,
+		0xff,0xfe,0xfd,0xfc,0xfb,0xfa,0xf9,
+		0xff,0xff,0xff,0xff,0xff,0xff,0xff,
+		0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	};
+
+	static const unsigned char total_mac[16] = {
+		0x64,0xaf,0xe2,0xe8,0xd6,0xad,0x7b,0xbd,
+		0xd2,0x87,0xf9,0x7c,0x44,0x62,0x3d,0x39
+	};
+
+	poly1305_context ctx;
+	poly1305_context total_ctx;
+	unsigned char all_key[32];
+	unsigned char all_msg[256];
+	unsigned char mac[16];
+	size_t i, j;
+	int result = 1;
+
+	for (i = 0; i < sizeof(mac); i++)
+		mac[i] = 0;
+	poly1305_auth(mac, nacl_msg, sizeof(nacl_msg), nacl_key);
+	result &= poly1305_verify(nacl_mac, mac);
+
+	for (i = 0; i < sizeof(mac); i++)
+		mac[i] = 0;
+	poly1305_init(&ctx, nacl_key);
+	poly1305_update(&ctx, nacl_msg +   0, 32);
+	poly1305_update(&ctx, nacl_msg +  32, 64);
+	poly1305_update(&ctx, nacl_msg +  96, 16);
+	poly1305_update(&ctx, nacl_msg + 112,  8);
+	poly1305_update(&ctx, nacl_msg + 120,  4);
+	poly1305_update(&ctx, nacl_msg + 124,  2);
+	poly1305_update(&ctx, nacl_msg + 126,  1);
+	poly1305_update(&ctx, nacl_msg + 127,  1);
+	poly1305_update(&ctx, nacl_msg + 128,  1);
+	poly1305_update(&ctx, nacl_msg + 129,  1);
+	poly1305_update(&ctx, nacl_msg + 130,  1);
+	poly1305_finish(&ctx, mac);
+	result &= poly1305_verify(nacl_mac, mac);
+
+	for (i = 0; i < sizeof(mac); i++)
+		mac[i] = 0;
+	poly1305_auth(mac, wrap_msg, sizeof(wrap_msg), wrap_key);
+	result &= poly1305_verify(wrap_mac, mac);
+
+	poly1305_init(&total_ctx, total_key);
+	for (i = 0; i < 256; i++) {
+		/* set key and message to 'i,i,i..' */
+		for (j = 0; j < sizeof(all_key); j++)
+			all_key[j] = i;
+		for (j = 0; j < i; j++)
+			all_msg[j] = i;
+		poly1305_auth(mac, all_msg, i, all_key);
+		poly1305_update(&total_ctx, mac, 16);
+	}
+	poly1305_finish(&total_ctx, mac);
+	result &= poly1305_verify(total_mac, mac);
+
+	return result;
+}
diff --git a/src/libcryptobox/poly1305/poly1305-donna.h b/src/libcryptobox/poly1305/poly1305-donna.h
new file mode 100644
index 000000000..94e23533f
--- /dev/null
+++ b/src/libcryptobox/poly1305/poly1305-donna.h
@@ -0,0 +1,20 @@
+#ifndef POLY1305_DONNA_H
+#define POLY1305_DONNA_H
+
+#include <stddef.h>
+
+typedef struct poly1305_context {
+	size_t aligner;
+	unsigned char opaque[136];
+} poly1305_context;
+
+void poly1305_init(poly1305_context *ctx, const unsigned char key[32]);
+void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes);
+void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]);
+void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]);
+
+int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);
+int poly1305_power_on_self_test(void);
+
+#endif /* POLY1305_DONNA_H */
+
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-02-06 14:10:44 +0000
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-02-06 14:10:44 +0000
commit	078ff20d758b7d3219f5c10735b3b1757db1eff0 (patch)
tree	29fd7d08ccecba35ca961dd5e70406ca24504fcf
parent	82f9e6dff521cd21d7d00939a8093ad00197c61b (diff)
download	rspamd-078ff20d758b7d3219f5c10735b3b1757db1eff0.tar.gz rspamd-078ff20d758b7d3219f5c10735b3b1757db1eff0.zip