diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/libcryptobox/CMakeLists.txt | 9 | ||||
-rw-r--r-- | src/libcryptobox/curve25519/LICENSE.md | 46 | ||||
-rw-r--r-- | src/libcryptobox/curve25519/curve25519-donna-c64.c | 500 | ||||
-rw-r--r-- | src/libcryptobox/curve25519/curve25519-donna.c | 912 | ||||
-rw-r--r-- | src/libcryptobox/curve25519/curve25519.h | 10 | ||||
-rw-r--r-- | src/libcryptobox/curve25519/ref.c | 323 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/README.md | 112 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna-16.h | 202 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna-32.h | 219 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna-64.h | 224 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna-8.h | 186 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna.c | 201 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna.h | 20 |
13 files changed, 2963 insertions, 1 deletions
diff --git a/src/libcryptobox/CMakeLists.txt b/src/libcryptobox/CMakeLists.txt index 311de754e..cf87c2604 100644 --- a/src/libcryptobox/CMakeLists.txt +++ b/src/libcryptobox/CMakeLists.txt @@ -4,6 +4,7 @@ INCLUDE(AsmOp.cmake) TARGET_ARCHITECTURE(ARCH) SET(CHACHASRC chacha20/ref.c) +SET(POLYSRC poly1305/poly1305-donna.c) # For now we support only x86_64 architecture with optimizations IF(${ARCH} STREQUAL "x86_64") @@ -25,6 +26,11 @@ IF(${ARCH} STREQUAL "x86_64") " "dollar macro convention") CONFIGURE_FILE(platform_config.h.in platform_config.h) INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}") + SET(CURVESRC curve25519/curve25519-donna-c64.c) +ELSEIF(${ARCH} STREQUAL "i386") + SET(CURVESRC curve25519/curve25519-donna.c) +ELSE() + SET(CURVESRC curve25519/ref.c) ENDIF() IF(HAVE_AVX2) @@ -39,7 +45,8 @@ ENDIF(HAVE_SSE2) SET(LIBCRYPTOBOXSRC cryptobox.c) -ADD_LIBRARY(rspamd-cryptobox ${LINK_TYPE} ${LIBCRYPTOBOXSRC} ${CHACHASRC}) +ADD_LIBRARY(rspamd-cryptobox ${LINK_TYPE} ${LIBCRYPTOBOXSRC} + ${CHACHASRC} ${POLYSRC} ${CURVESRC}) IF(NOT DEBIAN_BUILD) SET_TARGET_PROPERTIES(rspamd-cryptobox PROPERTIES VERSION ${RSPAMD_VERSION}) ENDIF(NOT DEBIAN_BUILD) diff --git a/src/libcryptobox/curve25519/LICENSE.md b/src/libcryptobox/curve25519/LICENSE.md new file mode 100644 index 000000000..33a3240ae --- /dev/null +++ b/src/libcryptobox/curve25519/LICENSE.md @@ -0,0 +1,46 @@ +Copyright 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. +* Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +curve25519-donna: Curve25519 elliptic curve, public key function + +http://code.google.com/p/curve25519-donna/ + +Adam Langley <agl@imperialviolet.org> + +Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to> + +More information about curve25519 can be found here + http://cr.yp.to/ecdh.html + +djb's sample implementation of curve25519 is written in a special assembly +language called qhasm and uses the floating point registers. + +This is, almost, a clean room reimplementation from the curve25519 paper. It +uses many of the tricks described therein. Only the crecip function is taken +from the sample implementation. diff --git a/src/libcryptobox/curve25519/curve25519-donna-c64.c b/src/libcryptobox/curve25519/curve25519-donna-c64.c new file mode 100644 index 000000000..2d693ee44 --- /dev/null +++ b/src/libcryptobox/curve25519/curve25519-donna-c64.c @@ -0,0 +1,500 @@ +/* Copyright 2008, Google Inc. + * All rights reserved. + * + * Code released into the public domain. + * + * curve25519-donna: Curve25519 elliptic curve, public key function + * + * http://code.google.com/p/curve25519-donna/ + * + * Adam Langley <agl@imperialviolet.org> + * + * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to> + * + * More information about curve25519 can be found here + * http://cr.yp.to/ecdh.html + * + * djb's sample implementation of curve25519 is written in a special assembly + * language called qhasm and uses the floating point registers. + * + * This is, almost, a clean room reimplementation from the curve25519 paper. It + * uses many of the tricks described therein. Only the crecip function is taken + * from the sample implementation. + */ + +#include <string.h> +#include <stdint.h> + +typedef uint8_t u8; +typedef uint64_t limb; +typedef limb felem[5]; +// This is a special gcc mode for 128-bit integers. It's implemented on 64-bit +// platforms only as far as I know. +typedef unsigned uint128_t __attribute__((mode(TI))); + +#undef force_inline +#define force_inline __attribute__((always_inline)) + +/* Sum two numbers: output += in */ +static inline void force_inline +fsum (limb *output, const limb *in) +{ + output[0] += in[0]; + output[1] += in[1]; + output[2] += in[2]; + output[3] += in[3]; + output[4] += in[4]; +} + +/* Find the difference of two numbers: output = in - output + * (note the order of the arguments!) + * + * Assumes that out[i] < 2**52 + * On return, out[i] < 2**55 + */ +static inline void force_inline +fdifference_backwards (felem out, const felem in) +{ + /* 152 is 19 << 3 */ + static const limb two54m152 = (((limb) 1) << 54) - 152; + static const limb two54m8 = (((limb) 1) << 54) - 8; + + out[0] = in[0] + two54m152 - out[0]; + out[1] = in[1] + two54m8 - out[1]; + out[2] = in[2] + two54m8 - out[2]; + out[3] = in[3] + two54m8 - out[3]; + out[4] = in[4] + two54m8 - out[4]; +} + +/* Multiply a number by a scalar: output = in * scalar */ +static inline void force_inline +fscalar_product (felem output, const felem in, const limb scalar) +{ + uint128_t a; + + a = ((uint128_t) in[0]) * scalar; + output[0] = ((limb) a) & 0x7ffffffffffff; + + a = ((uint128_t) in[1]) * scalar + ((limb) (a >> 51)); + output[1] = ((limb) a) & 0x7ffffffffffff; + + a = ((uint128_t) in[2]) * scalar + ((limb) (a >> 51)); + output[2] = ((limb) a) & 0x7ffffffffffff; + + a = ((uint128_t) in[3]) * scalar + ((limb) (a >> 51)); + output[3] = ((limb) a) & 0x7ffffffffffff; + + a = ((uint128_t) in[4]) * scalar + ((limb) (a >> 51)); + output[4] = ((limb) a) & 0x7ffffffffffff; + + output[0] += (a >> 51) * 19; +} + +/* Multiply two numbers: output = in2 * in + * + * output must be distinct to both inputs. The inputs are reduced coefficient + * form, the output is not. + * + * Assumes that in[i] < 2**55 and likewise for in2. + * On return, output[i] < 2**52 + */ +static inline void force_inline +fmul (felem output, const felem in2, const felem in) +{ + uint128_t t[5]; + limb r0, r1, r2, r3, r4, s0, s1, s2, s3, s4, c; + + r0 = in[0]; + r1 = in[1]; + r2 = in[2]; + r3 = in[3]; + r4 = in[4]; + + s0 = in2[0]; + s1 = in2[1]; + s2 = in2[2]; + s3 = in2[3]; + s4 = in2[4]; + + t[0] = ((uint128_t) r0) * s0; + t[1] = ((uint128_t) r0) * s1 + ((uint128_t) r1) * s0; + t[2] = ((uint128_t) r0) * s2 + ((uint128_t) r2) * s0 + + ((uint128_t) r1) * s1; + t[3] = ((uint128_t) r0) * s3 + ((uint128_t) r3) * s0 + ((uint128_t) r1) * s2 + + ((uint128_t) r2) * s1; + t[4] = ((uint128_t) r0) * s4 + ((uint128_t) r4) * s0 + ((uint128_t) r3) * s1 + + ((uint128_t) r1) * s3 + ((uint128_t) r2) * s2; + + r4 *= 19; + r1 *= 19; + r2 *= 19; + r3 *= 19; + + t[0] += ((uint128_t) r4) * s1 + ((uint128_t) r1) * s4 + + ((uint128_t) r2) * s3 + ((uint128_t) r3) * s2; + t[1] += ((uint128_t) r4) * s2 + ((uint128_t) r2) * s4 + + ((uint128_t) r3) * s3; + t[2] += ((uint128_t) r4) * s3 + ((uint128_t) r3) * s4; + t[3] += ((uint128_t) r4) * s4; + + r0 = (limb) t[0] & 0x7ffffffffffff; + c = (limb) (t[0] >> 51); + t[1] += c; + r1 = (limb) t[1] & 0x7ffffffffffff; + c = (limb) (t[1] >> 51); + t[2] += c; + r2 = (limb) t[2] & 0x7ffffffffffff; + c = (limb) (t[2] >> 51); + t[3] += c; + r3 = (limb) t[3] & 0x7ffffffffffff; + c = (limb) (t[3] >> 51); + t[4] += c; + r4 = (limb) t[4] & 0x7ffffffffffff; + c = (limb) (t[4] >> 51); + r0 += c * 19; + c = r0 >> 51; + r0 = r0 & 0x7ffffffffffff; + r1 += c; + c = r1 >> 51; + r1 = r1 & 0x7ffffffffffff; + r2 += c; + + output[0] = r0; + output[1] = r1; + output[2] = r2; + output[3] = r3; + output[4] = r4; +} + +static inline void force_inline +fsquare_times (felem output, const felem in, limb count) +{ + uint128_t t[5]; + limb r0, r1, r2, r3, r4, c; + limb d0, d1, d2, d4, d419; + + r0 = in[0]; + r1 = in[1]; + r2 = in[2]; + r3 = in[3]; + r4 = in[4]; + + do { + d0 = r0 * 2; + d1 = r1 * 2; + d2 = r2 * 2 * 19; + d419 = r4 * 19; + d4 = d419 * 2; + + t[0] = ((uint128_t) r0) * r0 + ((uint128_t) d4) * r1 + + (((uint128_t) d2) * (r3)); + t[1] = ((uint128_t) d0) * r1 + ((uint128_t) d4) * r2 + + (((uint128_t) r3) * (r3 * 19)); + t[2] = ((uint128_t) d0) * r2 + ((uint128_t) r1) * r1 + + (((uint128_t) d4) * (r3)); + t[3] = ((uint128_t) d0) * r3 + ((uint128_t) d1) * r2 + + (((uint128_t) r4) * (d419)); + t[4] = ((uint128_t) d0) * r4 + ((uint128_t) d1) * r3 + + (((uint128_t) r2) * (r2)); + + r0 = (limb) t[0] & 0x7ffffffffffff; + c = (limb) (t[0] >> 51); + t[1] += c; + r1 = (limb) t[1] & 0x7ffffffffffff; + c = (limb) (t[1] >> 51); + t[2] += c; + r2 = (limb) t[2] & 0x7ffffffffffff; + c = (limb) (t[2] >> 51); + t[3] += c; + r3 = (limb) t[3] & 0x7ffffffffffff; + c = (limb) (t[3] >> 51); + t[4] += c; + r4 = (limb) t[4] & 0x7ffffffffffff; + c = (limb) (t[4] >> 51); + r0 += c * 19; + c = r0 >> 51; + r0 = r0 & 0x7ffffffffffff; + r1 += c; + c = r1 >> 51; + r1 = r1 & 0x7ffffffffffff; + r2 += c; + } while (--count); + + output[0] = r0; + output[1] = r1; + output[2] = r2; + output[3] = r3; + output[4] = r4; +} + +/* Load a little-endian 64-bit number */ +static limb load_limb (const u8 *in) +{ + return ((limb) in[0]) | (((limb) in[1]) << 8) | (((limb) in[2]) << 16) + | (((limb) in[3]) << 24) | (((limb) in[4]) << 32) + | (((limb) in[5]) << 40) | (((limb) in[6]) << 48) + | (((limb) in[7]) << 56); +} + +static void store_limb (u8 *out, limb in) +{ + out[0] = in & 0xff; + out[1] = (in >> 8) & 0xff; + out[2] = (in >> 16) & 0xff; + out[3] = (in >> 24) & 0xff; + out[4] = (in >> 32) & 0xff; + out[5] = (in >> 40) & 0xff; + out[6] = (in >> 48) & 0xff; + out[7] = (in >> 56) & 0xff; +} + +/* Take a little-endian, 32-byte number and expand it into polynomial form */ +static void fexpand (limb *output, const u8 *in) +{ + output[0] = load_limb (in) & 0x7ffffffffffff; + output[1] = (load_limb (in + 6) >> 3) & 0x7ffffffffffff; + output[2] = (load_limb (in + 12) >> 6) & 0x7ffffffffffff; + output[3] = (load_limb (in + 19) >> 1) & 0x7ffffffffffff; + output[4] = (load_limb (in + 24) >> 12) & 0x7ffffffffffff; +} + +/* Take a fully reduced polynomial form number and contract it into a + * little-endian, 32-byte array + */ +static void fcontract (u8 *output, const felem input) +{ + uint128_t t[5]; + + t[0] = input[0]; + t[1] = input[1]; + t[2] = input[2]; + t[3] = input[3]; + t[4] = input[4]; + + t[1] += t[0] >> 51; + t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; + t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; + t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; + t[3] &= 0x7ffffffffffff; + t[0] += 19 * (t[4] >> 51); + t[4] &= 0x7ffffffffffff; + + t[1] += t[0] >> 51; + t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; + t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; + t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; + t[3] &= 0x7ffffffffffff; + t[0] += 19 * (t[4] >> 51); + t[4] &= 0x7ffffffffffff; + + /* now t is between 0 and 2^255-1, properly carried. */ + /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */ + + t[0] += 19; + + t[1] += t[0] >> 51; + t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; + t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; + t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; + t[3] &= 0x7ffffffffffff; + t[0] += 19 * (t[4] >> 51); + t[4] &= 0x7ffffffffffff; + + /* now between 19 and 2^255-1 in both cases, and offset by 19. */ + + t[0] += 0x8000000000000 - 19; + t[1] += 0x8000000000000 - 1; + t[2] += 0x8000000000000 - 1; + t[3] += 0x8000000000000 - 1; + t[4] += 0x8000000000000 - 1; + + /* now between 2^255 and 2^256-20, and offset by 2^255. */ + + t[1] += t[0] >> 51; + t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; + t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; + t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; + t[3] &= 0x7ffffffffffff; + t[4] &= 0x7ffffffffffff; + + store_limb (output, t[0] | (t[1] << 51)); + store_limb (output + 8, (t[1] >> 13) | (t[2] << 38)); + store_limb (output + 16, (t[2] >> 26) | (t[3] << 25)); + store_limb (output + 24, (t[3] >> 39) | (t[4] << 12)); +} + +/* Input: Q, Q', Q-Q' + * Output: 2Q, Q+Q' + * + * x2 z3: long form + * x3 z3: long form + * x z: short form, destroyed + * xprime zprime: short form, destroyed + * qmqp: short form, preserved + */ +static void fmonty (limb *x2, limb *z2, /* output 2Q */ +limb *x3, limb *z3, /* output Q + Q' */ +limb *x, limb *z, /* input Q */ +limb *xprime, limb *zprime, /* input Q' */ +const limb *qmqp /* input Q - Q' */) +{ + limb origx[5], origxprime[5], zzz[5], xx[5], zz[5], xxprime[5], zzprime[5], + zzzprime[5]; + + memcpy (origx, x, 5 * sizeof(limb)); + fsum (x, z); + fdifference_backwards (z, origx); // does x - z + + memcpy (origxprime, xprime, sizeof(limb) * 5); + fsum (xprime, zprime); + fdifference_backwards (zprime, origxprime); + fmul (xxprime, xprime, z); + fmul (zzprime, x, zprime); + memcpy (origxprime, xxprime, sizeof(limb) * 5); + fsum (xxprime, zzprime); + fdifference_backwards (zzprime, origxprime); + fsquare_times (x3, xxprime, 1); + fsquare_times (zzzprime, zzprime, 1); + fmul (z3, zzzprime, qmqp); + + fsquare_times (xx, x, 1); + fsquare_times (zz, z, 1); + fmul (x2, xx, zz); + fdifference_backwards (zz, xx); // does zz = xx - zz + fscalar_product (zzz, zz, 121665); + fsum (zzz, xx); + fmul (z2, zz, zzz); +} + +// ----------------------------------------------------------------------------- +// Maybe swap the contents of two limb arrays (@a and @b), each @len elements +// long. Perform the swap iff @swap is non-zero. +// +// This function performs the swap without leaking any side-channel +// information. +// ----------------------------------------------------------------------------- +static void swap_conditional (limb a[5], limb b[5], limb iswap) +{ + unsigned i; + const limb swap = -iswap; + + for (i = 0; i < 5; ++i) { + const limb x = swap & (a[i] ^ b[i]); + a[i] ^= x; + b[i] ^= x; + } +} + +/* Calculates nQ where Q is the x-coordinate of a point on the curve + * + * resultx/resultz: the x coordinate of the resulting curve point (short form) + * n: a little endian, 32-byte number + * q: a point of the curve (short form) + */ +static void cmult (limb *resultx, limb *resultz, const u8 *n, const limb *q) +{ + limb a[5] = { 0 }, b[5] = { 1 }, c[5] = { 1 }, d[5] = { 0 }; + limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; + limb e[5] = { 0 }, f[5] = { 1 }, g[5] = { 0 }, h[5] = { 1 }; + limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; + + unsigned i, j; + + memcpy (nqpqx, q, sizeof(limb) * 5); + + for (i = 0; i < 32; ++i) { + u8 byte = n[31 - i]; + for (j = 0; j < 8; ++j) { + const limb bit = byte >> 7; + + swap_conditional (nqx, nqpqx, bit); + swap_conditional (nqz, nqpqz, bit); + fmonty (nqx2, nqz2, nqpqx2, nqpqz2, nqx, nqz, nqpqx, nqpqz, q); + swap_conditional (nqx2, nqpqx2, bit); + swap_conditional (nqz2, nqpqz2, bit); + + t = nqx; + nqx = nqx2; + nqx2 = t; + t = nqz; + nqz = nqz2; + nqz2 = t; + t = nqpqx; + nqpqx = nqpqx2; + nqpqx2 = t; + t = nqpqz; + nqpqz = nqpqz2; + nqpqz2 = t; + + byte <<= 1; + } + } + + memcpy (resultx, nqx, sizeof(limb) * 5); + memcpy (resultz, nqz, sizeof(limb) * 5); +} + +// ----------------------------------------------------------------------------- +// Shamelessly copied from djb's code, tightened a little +// ----------------------------------------------------------------------------- +static void crecip (felem out, const felem z) +{ + felem a, t0, b, c; + + /* 2 */fsquare_times (a, z, 1); // a = 2 + /* 8 */fsquare_times (t0, a, 2); + /* 9 */fmul (b, t0, z); // b = 9 + /* 11 */fmul (a, b, a); // a = 11 + /* 22 */fsquare_times (t0, a, 1); + /* 2^5 - 2^0 = 31 */fmul (b, t0, b); + /* 2^10 - 2^5 */fsquare_times (t0, b, 5); + /* 2^10 - 2^0 */fmul (b, t0, b); + /* 2^20 - 2^10 */fsquare_times (t0, b, 10); + /* 2^20 - 2^0 */fmul (c, t0, b); + /* 2^40 - 2^20 */fsquare_times (t0, c, 20); + /* 2^40 - 2^0 */fmul (t0, t0, c); + /* 2^50 - 2^10 */fsquare_times (t0, t0, 10); + /* 2^50 - 2^0 */fmul (b, t0, b); + /* 2^100 - 2^50 */fsquare_times (t0, b, 50); + /* 2^100 - 2^0 */fmul (c, t0, b); + /* 2^200 - 2^100 */fsquare_times (t0, c, 100); + /* 2^200 - 2^0 */fmul (t0, t0, c); + /* 2^250 - 2^50 */fsquare_times (t0, t0, 50); + /* 2^250 - 2^0 */fmul (t0, t0, b); + /* 2^255 - 2^5 */fsquare_times (t0, t0, 5); + /* 2^255 - 21 */fmul (out, t0, a); +} + +int curve25519 (u8 *, const u8 *, const u8 *); + +int curve25519 (u8 *mypublic, const u8 *secret, const u8 *basepoint) +{ + limb bp[5], x[5], z[5], zmone[5]; + uint8_t e[32]; + int i; + + for (i = 0; i < 32; ++i) + e[i] = secret[i]; + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + fexpand (bp, basepoint); + cmult (x, z, e, bp); + crecip (zmone, z); + fmul (z, x, zmone); + fcontract (mypublic, z); + return 0; +} diff --git a/src/libcryptobox/curve25519/curve25519-donna.c b/src/libcryptobox/curve25519/curve25519-donna.c new file mode 100644 index 000000000..f9f19a632 --- /dev/null +++ b/src/libcryptobox/curve25519/curve25519-donna.c @@ -0,0 +1,912 @@ +/* Copyright 2008, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * curve25519-donna: Curve25519 elliptic curve, public key function + * + * http://code.google.com/p/curve25519-donna/ + * + * Adam Langley <agl@imperialviolet.org> + * + * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to> + * + * More information about curve25519 can be found here + * http://cr.yp.to/ecdh.html + * + * djb's sample implementation of curve25519 is written in a special assembly + * language called qhasm and uses the floating point registers. + * + * This is, almost, a clean room reimplementation from the curve25519 paper. It + * uses many of the tricks described therein. Only the crecip function is taken + * from the sample implementation. */ + +#include <string.h> +#include <stdint.h> + +#ifdef _MSC_VER +#define inline __inline +#endif + +typedef uint8_t u8; +typedef int32_t s32; +typedef int64_t limb; + +/* Field element representation: + * + * Field elements are written as an array of signed, 64-bit limbs, least + * significant first. The value of the field element is: + * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... + * + * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */ + +/* Sum two numbers: output += in */ +static void fsum (limb *output, const limb *in) +{ + unsigned i; + for (i = 0; i < 10; i += 2) { + output[0 + i] = output[0 + i] + in[0 + i]; + output[1 + i] = output[1 + i] + in[1 + i]; + } +} + +/* Find the difference of two numbers: output = in - output + * (note the order of the arguments!). */ +static void fdifference (limb *output, const limb *in) +{ + unsigned i; + for (i = 0; i < 10; ++i) { + output[i] = in[i] - output[i]; + } +} + +/* Multiply a number by a scalar: output = in * scalar */ +static void fscalar_product (limb *output, const limb *in, const limb scalar) +{ + unsigned i; + for (i = 0; i < 10; ++i) { + output[i] = in[i] * scalar; + } +} + +/* Multiply two numbers: output = in2 * in + * + * output must be distinct to both inputs. The inputs are reduced coefficient + * form, the output is not. + * + * output[x] <= 14 * the largest product of the input limbs. */ +static void fproduct (limb *output, const limb *in2, const limb *in) +{ + output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]); + output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) + + ((limb) ((s32) in2[1])) * ((s32) in[0]); + output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[2]) + + ((limb) ((s32) in2[2])) * ((s32) in[0]); + output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) + + ((limb) ((s32) in2[2])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[3]) + + ((limb) ((s32) in2[3])) * ((s32) in[0]); + output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) + + 2 + * (((limb) ((s32) in2[1])) * ((s32) in[3]) + + ((limb) ((s32) in2[3])) * ((s32) in[1])) + + ((limb) ((s32) in2[0])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[0]); + output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) + + ((limb) ((s32) in2[3])) * ((s32) in[2]) + + ((limb) ((s32) in2[1])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[0]); + output[6] = 2 + * (((limb) ((s32) in2[3])) * ((s32) in[3]) + + ((limb) ((s32) in2[1])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[1])) + + ((limb) ((s32) in2[2])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[2]) + + ((limb) ((s32) in2[0])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[0]); + output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[3]) + + ((limb) ((s32) in2[2])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[2]) + + ((limb) ((s32) in2[1])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[0]); + output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) + + 2 + * (((limb) ((s32) in2[3])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[3]) + + ((limb) ((s32) in2[1])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[1])) + + ((limb) ((s32) in2[2])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[2]) + + ((limb) ((s32) in2[0])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[0]); + output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[4]) + + ((limb) ((s32) in2[3])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[3]) + + ((limb) ((s32) in2[2])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[2]) + + ((limb) ((s32) in2[1])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[0]); + output[10] = 2 + * (((limb) ((s32) in2[5])) * ((s32) in[5]) + + ((limb) ((s32) in2[3])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[3]) + + ((limb) ((s32) in2[1])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[1])) + + ((limb) ((s32) in2[4])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[4]) + + ((limb) ((s32) in2[2])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[2]); + output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[5]) + + ((limb) ((s32) in2[4])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[4]) + + ((limb) ((s32) in2[3])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[3]) + + ((limb) ((s32) in2[2])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[2]); + output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) + + 2 + * (((limb) ((s32) in2[5])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[5]) + + ((limb) ((s32) in2[3])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[3])) + + ((limb) ((s32) in2[4])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[4]); + output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[6]) + + ((limb) ((s32) in2[5])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[5]) + + ((limb) ((s32) in2[4])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[4]); + output[14] = 2 + * (((limb) ((s32) in2[7])) * ((s32) in[7]) + + ((limb) ((s32) in2[5])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[5])) + + ((limb) ((s32) in2[6])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[6]); + output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[7]) + + ((limb) ((s32) in2[6])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[6]); + output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) + + 2 + * (((limb) ((s32) in2[7])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[7])); + output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[8]); + output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]); +} + +/* Reduce a long form to a short form by taking the input mod 2^255 - 19. + * + * On entry: |output[i]| < 14*2^54 + * On exit: |output[0..8]| < 280*2^54 */ +static void freduce_degree (limb *output) +{ + /* Each of these shifts and adds ends up multiplying the value by 19. + * + * For output[0..8], the absolute entry value is < 14*2^54 and we add, at + * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */ + output[8] += output[18] << 4; + output[8] += output[18] << 1; + output[8] += output[18]; + output[7] += output[17] << 4; + output[7] += output[17] << 1; + output[7] += output[17]; + output[6] += output[16] << 4; + output[6] += output[16] << 1; + output[6] += output[16]; + output[5] += output[15] << 4; + output[5] += output[15] << 1; + output[5] += output[15]; + output[4] += output[14] << 4; + output[4] += output[14] << 1; + output[4] += output[14]; + output[3] += output[13] << 4; + output[3] += output[13] << 1; + output[3] += output[13]; + output[2] += output[12] << 4; + output[2] += output[12] << 1; + output[2] += output[12]; + output[1] += output[11] << 4; + output[1] += output[11] << 1; + output[1] += output[11]; + output[0] += output[10] << 4; + output[0] += output[10] << 1; + output[0] += output[10]; +} + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +/* return v / 2^26, using only shifts and adds. + * + * On entry: v can take any value. */ +static inline limb div_by_2_26 (const limb v) +{ + /* High word of v; no shift needed. */ + const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); + /* Set to all 1s if v was negative; else set to 0s. */ + const int32_t sign = ((int32_t) highword) >> 31; + /* Set to 0x3ffffff if v was negative; else set to 0. */ + const int32_t roundoff = ((uint32_t) sign) >> 6; + /* Should return v / (1<<26) */ + return (v + roundoff) >> 26; +} + +/* return v / (2^25), using only shifts and adds. + * + * On entry: v can take any value. */ +static inline limb div_by_2_25 (const limb v) +{ + /* High word of v; no shift needed*/ + const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); + /* Set to all 1s if v was negative; else set to 0s. */ + const int32_t sign = ((int32_t) highword) >> 31; + /* Set to 0x1ffffff if v was negative; else set to 0. */ + const int32_t roundoff = ((uint32_t) sign) >> 7; + /* Should return v / (1<<25) */ + return (v + roundoff) >> 25; +} + +/* Reduce all coefficients of the short form input so that |x| < 2^26. + * + * On entry: |output[i]| < 280*2^54 */ +static void freduce_coefficients (limb *output) +{ + unsigned i; + + output[10] = 0; + + for (i = 0; i < 10; i += 2) { + limb over = div_by_2_26 (output[i]); + /* The entry condition (that |output[i]| < 280*2^54) means that over is, at + * most, 280*2^28 in the first iteration of this loop. This is added to the + * next limb and we can approximate the resulting bound of that limb by + * 281*2^54. */ + output[i] -= over << 26; + output[i + 1] += over; + + /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| < + * 281*2^29. When this is added to the next limb, the resulting bound can + * be approximated as 281*2^54. + * + * For subsequent iterations of the loop, 281*2^54 remains a conservative + * bound and no overflow occurs. */ + over = div_by_2_25 (output[i + 1]); + output[i + 1] -= over << 25; + output[i + 2] += over; + } + /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */ + output[0] += output[10] << 4; + output[0] += output[10] << 1; + output[0] += output[10]; + + output[10] = 0; + + /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29 + * So |over| will be no more than 2^16. */ + { + limb over = div_by_2_26 (output[0]); + output[0] -= over << 26; + output[1] += over; + } + + /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The + * bound on |output[1]| is sufficient to meet our needs. */ +} + +/* A helpful wrapper around fproduct: output = in * in2. + * + * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27. + * + * output must be distinct to both inputs. The output is reduced degree + * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */ +static void fmul (limb *output, const limb *in, const limb *in2) +{ + limb t[19]; + fproduct (t, in, in2); + /* |t[i]| < 14*2^54 */ + freduce_degree (t); + freduce_coefficients (t); + /* |t[i]| < 2^26 */ + memcpy (output, t, sizeof(limb) * 10); +} + +/* Square a number: output = in**2 + * + * output must be distinct from the input. The inputs are reduced coefficient + * form, the output is not. + * + * output[x] <= 14 * the largest product of the input limbs. */ +static void fsquare_inner (limb *output, const limb *in) +{ + output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); + output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); + output[2] = 2 + * (((limb) ((s32) in[1])) * ((s32) in[1]) + + ((limb) ((s32) in[0])) * ((s32) in[2])); + output[3] = 2 + * (((limb) ((s32) in[1])) * ((s32) in[2]) + + ((limb) ((s32) in[0])) * ((s32) in[3])); + output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) + + 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) + + 2 * ((limb) ((s32) in[0])) * ((s32) in[4]); + output[5] = 2 + * (((limb) ((s32) in[2])) * ((s32) in[3]) + + ((limb) ((s32) in[1])) * ((s32) in[4]) + + ((limb) ((s32) in[0])) * ((s32) in[5])); + output[6] = 2 + * (((limb) ((s32) in[3])) * ((s32) in[3]) + + ((limb) ((s32) in[2])) * ((s32) in[4]) + + ((limb) ((s32) in[0])) * ((s32) in[6]) + + 2 * ((limb) ((s32) in[1])) * ((s32) in[5])); + output[7] = 2 + * (((limb) ((s32) in[3])) * ((s32) in[4]) + + ((limb) ((s32) in[2])) * ((s32) in[5]) + + ((limb) ((s32) in[1])) * ((s32) in[6]) + + ((limb) ((s32) in[0])) * ((s32) in[7])); + output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) + + 2 + * (((limb) ((s32) in[2])) * ((s32) in[6]) + + ((limb) ((s32) in[0])) * ((s32) in[8]) + + 2 + * (((limb) ((s32) in[1])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) + * ((s32) in[5]))); + output[9] = 2 + * (((limb) ((s32) in[4])) * ((s32) in[5]) + + ((limb) ((s32) in[3])) * ((s32) in[6]) + + ((limb) ((s32) in[2])) * ((s32) in[7]) + + ((limb) ((s32) in[1])) * ((s32) in[8]) + + ((limb) ((s32) in[0])) * ((s32) in[9])); + output[10] = 2 + * (((limb) ((s32) in[5])) * ((s32) in[5]) + + ((limb) ((s32) in[4])) * ((s32) in[6]) + + ((limb) ((s32) in[2])) * ((s32) in[8]) + + 2 + * (((limb) ((s32) in[3])) * ((s32) in[7]) + + ((limb) ((s32) in[1])) * ((s32) in[9]))); + output[11] = 2 + * (((limb) ((s32) in[5])) * ((s32) in[6]) + + ((limb) ((s32) in[4])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) * ((s32) in[8]) + + ((limb) ((s32) in[2])) * ((s32) in[9])); + output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) + + 2 + * (((limb) ((s32) in[4])) * ((s32) in[8]) + + 2 + * (((limb) ((s32) in[5])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) + * ((s32) in[9]))); + output[13] = 2 + * (((limb) ((s32) in[6])) * ((s32) in[7]) + + ((limb) ((s32) in[5])) * ((s32) in[8]) + + ((limb) ((s32) in[4])) * ((s32) in[9])); + output[14] = 2 + * (((limb) ((s32) in[7])) * ((s32) in[7]) + + ((limb) ((s32) in[6])) * ((s32) in[8]) + + 2 * ((limb) ((s32) in[5])) * ((s32) in[9])); + output[15] = 2 + * (((limb) ((s32) in[7])) * ((s32) in[8]) + + ((limb) ((s32) in[6])) * ((s32) in[9])); + output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) + + 4 * ((limb) ((s32) in[7])) * ((s32) in[9]); + output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); + output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); +} + +/* fsquare sets output = in^2. + * + * On entry: The |in| argument is in reduced coefficients form and |in[i]| < + * 2^27. + * + * On exit: The |output| argument is in reduced coefficients form (indeed, one + * need only provide storage for 10 limbs) and |out[i]| < 2^26. */ +static void fsquare (limb *output, const limb *in) +{ + limb t[19]; + fsquare_inner (t, in); + /* |t[i]| < 14*2^54 because the largest product of two limbs will be < + * 2^(27+27) and fsquare_inner adds together, at most, 14 of those + * products. */ + freduce_degree (t); + freduce_coefficients (t); + /* |t[i]| < 2^26 */ + memcpy (output, t, sizeof(limb) * 10); +} + +/* Take a little-endian, 32-byte number and expand it into polynomial form */ +static void fexpand (limb *output, const u8 *input) +{ +#define F(n,start,shift,mask) \ + output[n] = ((((limb) input[start + 0]) | \ + ((limb) input[start + 1]) << 8 | \ + ((limb) input[start + 2]) << 16 | \ + ((limb) input[start + 3]) << 24) >> shift) & mask; + F(0, 0, 0, 0x3ffffff); + F(1, 3, 2, 0x1ffffff); + F(2, 6, 3, 0x3ffffff); + F(3, 9, 5, 0x1ffffff); + F(4, 12, 6, 0x3ffffff); + F(5, 16, 0, 0x1ffffff); + F(6, 19, 1, 0x3ffffff); + F(7, 22, 3, 0x1ffffff); + F(8, 25, 4, 0x3ffffff); + F(9, 28, 6, 0x1ffffff); +#undef F +} + +#if (-32 >> 1) != -16 +#error "This code only works when >> does sign-extension on negative numbers" +#endif + +/* s32_eq returns 0xffffffff iff a == b and zero otherwise. */ +static s32 s32_eq (s32 a, s32 b) +{ + a = ~(a ^ b); + a &= a << 16; + a &= a << 8; + a &= a << 4; + a &= a << 2; + a &= a << 1; + return a >> 31; +} + +/* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are + * both non-negative. */ +static s32 s32_gte (s32 a, s32 b) +{ + a -= b; + /* a >= 0 iff a >= b. */ + return ~(a >> 31); +} + +/* Take a fully reduced polynomial form number and contract it into a + * little-endian, 32-byte array. + * + * On entry: |input_limbs[i]| < 2^26 */ +static void fcontract (u8 *output, limb *input_limbs) +{ + int i; + int j; + s32 input[10]; + s32 mask; + + /* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */ + for (i = 0; i < 10; i++) { + input[i] = input_limbs[i]; + } + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 9; ++i) { + if ((i & 1) == 1) { + /* This calculation is a time-invariant way to make input[i] + * non-negative by borrowing from the next-larger limb. */ + const s32 mask = input[i] >> 31; + const s32 carry = -((input[i] & mask) >> 25); + input[i] = input[i] + (carry << 25); + input[i + 1] = input[i + 1] - carry; + } + else { + const s32 mask = input[i] >> 31; + const s32 carry = -((input[i] & mask) >> 26); + input[i] = input[i] + (carry << 26); + input[i + 1] = input[i + 1] - carry; + } + } + + /* There's no greater limb for input[9] to borrow from, but we can multiply + * by 19 and borrow from input[0], which is valid mod 2^255-19. */ + { + const s32 mask = input[9] >> 31; + const s32 carry = -((input[9] & mask) >> 25); + input[9] = input[9] + (carry << 25); + input[0] = input[0] - (carry * 19); + } + + /* After the first iteration, input[1..9] are non-negative and fit within + * 25 or 26 bits, depending on position. However, input[0] may be + * negative. */ + } + + /* The first borrow-propagation pass above ended with every limb + except (possibly) input[0] non-negative. + + If input[0] was negative after the first pass, then it was because of a + carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most, + one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19. + + In the second pass, each limb is decreased by at most one. Thus the second + borrow-propagation pass could only have wrapped around to decrease + input[0] again if the first pass left input[0] negative *and* input[1] + through input[9] were all zero. In that case, input[1] is now 2^25 - 1, + and this last borrow-propagation step will leave input[1] non-negative. */ + { + const s32 mask = input[0] >> 31; + const s32 carry = -((input[0] & mask) >> 26); + input[0] = input[0] + (carry << 26); + input[1] = input[1] - carry; + } + + /* All input[i] are now non-negative. However, there might be values between + * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */ + for (j = 0; j < 2; j++) { + for (i = 0; i < 9; i++) { + if ((i & 1) == 1) { + const s32 carry = input[i] >> 25; + input[i] &= 0x1ffffff; + input[i + 1] += carry; + } + else { + const s32 carry = input[i] >> 26; + input[i] &= 0x3ffffff; + input[i + 1] += carry; + } + } + + { + const s32 carry = input[9] >> 25; + input[9] &= 0x1ffffff; + input[0] += 19 * carry; + } + } + + /* If the first carry-chain pass, just above, ended up with a carry from + * input[9], and that caused input[0] to be out-of-bounds, then input[0] was + * < 2^26 + 2*19, because the carry was, at most, two. + * + * If the second pass carried from input[9] again then input[0] is < 2*19 and + * the input[9] -> input[0] carry didn't push input[0] out of bounds. */ + + /* It still remains the case that input might be between 2^255-19 and 2^255. + * In this case, input[1..9] must take their maximum value and input[0] must + * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */ + mask = s32_gte (input[0], 0x3ffffed); + for (i = 1; i < 10; i++) { + if ((i & 1) == 1) { + mask &= s32_eq (input[i], 0x1ffffff); + } + else { + mask &= s32_eq (input[i], 0x3ffffff); + } + } + + /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus + * this conditionally subtracts 2^255-19. */ + input[0] -= mask & 0x3ffffed; + + for (i = 1; i < 10; i++) { + if ((i & 1) == 1) { + input[i] -= mask & 0x1ffffff; + } + else { + input[i] -= mask & 0x3ffffff; + } + } + + input[1] <<= 2; + input[2] <<= 3; + input[3] <<= 5; + input[4] <<= 6; + input[6] <<= 1; + input[7] <<= 3; + input[8] <<= 4; + input[9] <<= 6; +#define F(i, s) \ + output[s+0] |= input[i] & 0xff; \ + output[s+1] = (input[i] >> 8) & 0xff; \ + output[s+2] = (input[i] >> 16) & 0xff; \ + output[s+3] = (input[i] >> 24) & 0xff; + output[0] = 0; + output[16] = 0; + F(0, 0); + F(1, 3); + F(2, 6); + F(3, 9); + F(4, 12); + F(5, 16); + F(6, 19); + F(7, 22); + F(8, 25); + F(9, 28); +#undef F +} + +/* Input: Q, Q', Q-Q' + * Output: 2Q, Q+Q' + * + * x2 z3: long form + * x3 z3: long form + * x z: short form, destroyed + * xprime zprime: short form, destroyed + * qmqp: short form, preserved + * + * On entry and exit, the absolute value of the limbs of all inputs and outputs + * are < 2^26. */ +static void fmonty (limb *x2, limb *z2, /* output 2Q */ +limb *x3, limb *z3, /* output Q + Q' */ +limb *x, limb *z, /* input Q */ +limb *xprime, limb *zprime, /* input Q' */ +const limb *qmqp /* input Q - Q' */) +{ + limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], + zzprime[19], zzzprime[19], xxxprime[19]; + + memcpy (origx, x, 10 * sizeof(limb)); + fsum (x, z); + /* |x[i]| < 2^27 */ + fdifference (z, origx); /* does x - z */ + /* |z[i]| < 2^27 */ + + memcpy (origxprime, xprime, sizeof(limb) * 10); + fsum (xprime, zprime); + /* |xprime[i]| < 2^27 */ + fdifference (zprime, origxprime); + /* |zprime[i]| < 2^27 */ + fproduct (xxprime, xprime, z); + /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be < + * 2^(27+27) and fproduct adds together, at most, 14 of those products. + * (Approximating that to 2^58 doesn't work out.) */ + fproduct (zzprime, x, zprime); + /* |zzprime[i]| < 14*2^54 */ + freduce_degree (xxprime); + freduce_coefficients (xxprime); + /* |xxprime[i]| < 2^26 */ + freduce_degree (zzprime); + freduce_coefficients (zzprime); + /* |zzprime[i]| < 2^26 */ + memcpy (origxprime, xxprime, sizeof(limb) * 10); + fsum (xxprime, zzprime); + /* |xxprime[i]| < 2^27 */ + fdifference (zzprime, origxprime); + /* |zzprime[i]| < 2^27 */ + fsquare (xxxprime, xxprime); + /* |xxxprime[i]| < 2^26 */ + fsquare (zzzprime, zzprime); + /* |zzzprime[i]| < 2^26 */ + fproduct (zzprime, zzzprime, qmqp); + /* |zzprime[i]| < 14*2^52 */ + freduce_degree (zzprime); + freduce_coefficients (zzprime); + /* |zzprime[i]| < 2^26 */ + memcpy (x3, xxxprime, sizeof(limb) * 10); + memcpy (z3, zzprime, sizeof(limb) * 10); + + fsquare (xx, x); + /* |xx[i]| < 2^26 */ + fsquare (zz, z); + /* |zz[i]| < 2^26 */ + fproduct (x2, xx, zz); + /* |x2[i]| < 14*2^52 */ + freduce_degree (x2); + freduce_coefficients (x2); + /* |x2[i]| < 2^26 */ + fdifference (zz, xx); // does zz = xx - zz + /* |zz[i]| < 2^27 */ + memset (zzz + 10, 0, sizeof(limb) * 9); + fscalar_product (zzz, zz, 121665); + /* |zzz[i]| < 2^(27+17) */ + /* No need to call freduce_degree here: + fscalar_product doesn't increase the degree of its input. */ + freduce_coefficients (zzz); + /* |zzz[i]| < 2^26 */ + fsum (zzz, xx); + /* |zzz[i]| < 2^27 */ + fproduct (z2, zz, zzz); + /* |z2[i]| < 14*2^(26+27) */ + freduce_degree (z2); + freduce_coefficients (z2); + /* |z2|i| < 2^26 */ +} + +/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave + * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid + * side-channel attacks. + * + * NOTE that this function requires that 'iswap' be 1 or 0; other values give + * wrong results. Also, the two limb arrays must be in reduced-coefficient, + * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped, + * and all all values in a[0..9],b[0..9] must have magnitude less than + * INT32_MAX. */ +static void swap_conditional (limb a[19], limb b[19], limb iswap) +{ + unsigned i; + const s32 swap = (s32) -iswap; + + for (i = 0; i < 10; ++i) { + const s32 x = swap & (((s32) a[i]) ^ ((s32) b[i])); + a[i] = ((s32) a[i]) ^ x; + b[i] = ((s32) b[i]) ^ x; + } +} + +/* Calculates nQ where Q is the x-coordinate of a point on the curve + * + * resultx/resultz: the x coordinate of the resulting curve point (short form) + * n: a little endian, 32-byte number + * q: a point of the curve (short form) */ +static void cmult (limb *resultx, limb *resultz, const u8 *n, const limb *q) +{ + limb a[19] = { 0 }, b[19] = { 1 }, c[19] = { 1 }, d[19] = { 0 }; + limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; + limb e[19] = { 0 }, f[19] = { 1 }, g[19] = { 0 }, h[19] = { 1 }; + limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; + + unsigned i, j; + + memcpy (nqpqx, q, sizeof(limb) * 10); + + for (i = 0; i < 32; ++i) { + u8 byte = n[31 - i]; + for (j = 0; j < 8; ++j) { + const limb bit = byte >> 7; + + swap_conditional (nqx, nqpqx, bit); + swap_conditional (nqz, nqpqz, bit); + fmonty (nqx2, nqz2, nqpqx2, nqpqz2, nqx, nqz, nqpqx, nqpqz, q); + swap_conditional (nqx2, nqpqx2, bit); + swap_conditional (nqz2, nqpqz2, bit); + + t = nqx; + nqx = nqx2; + nqx2 = t; + t = nqz; + nqz = nqz2; + nqz2 = t; + t = nqpqx; + nqpqx = nqpqx2; + nqpqx2 = t; + t = nqpqz; + nqpqz = nqpqz2; + nqpqz2 = t; + + byte <<= 1; + } + } + + memcpy (resultx, nqx, sizeof(limb) * 10); + memcpy (resultz, nqz, sizeof(limb) * 10); +} + +// ----------------------------------------------------------------------------- +// Shamelessly copied from djb's code +// ----------------------------------------------------------------------------- +static void crecip (limb *out, const limb *z) +{ + limb z2[10]; + limb z9[10]; + limb z11[10]; + limb z2_5_0[10]; + limb z2_10_0[10]; + limb z2_20_0[10]; + limb z2_50_0[10]; + limb z2_100_0[10]; + limb t0[10]; + limb t1[10]; + int i; + + /* 2 */fsquare (z2, z); + /* 4 */fsquare (t1, z2); + /* 8 */fsquare (t0, t1); + /* 9 */fmul (z9, t0, z); + /* 11 */fmul (z11, z9, z2); + /* 22 */fsquare (t0, z11); + /* 2^5 - 2^0 = 31 */fmul (z2_5_0, t0, z9); + + /* 2^6 - 2^1 */fsquare (t0, z2_5_0); + /* 2^7 - 2^2 */fsquare (t1, t0); + /* 2^8 - 2^3 */fsquare (t0, t1); + /* 2^9 - 2^4 */fsquare (t1, t0); + /* 2^10 - 2^5 */fsquare (t0, t1); + /* 2^10 - 2^0 */fmul (z2_10_0, t0, z2_5_0); + + /* 2^11 - 2^1 */fsquare (t0, z2_10_0); + /* 2^12 - 2^2 */fsquare (t1, t0); + /* 2^20 - 2^10 */for (i = 2; i < 10; i += 2) { + fsquare (t0, t1); + fsquare (t1, t0); + } + /* 2^20 - 2^0 */fmul (z2_20_0, t1, z2_10_0); + + /* 2^21 - 2^1 */fsquare (t0, z2_20_0); + /* 2^22 - 2^2 */fsquare (t1, t0); + /* 2^40 - 2^20 */for (i = 2; i < 20; i += 2) { + fsquare (t0, t1); + fsquare (t1, t0); + } + /* 2^40 - 2^0 */fmul (t0, t1, z2_20_0); + + /* 2^41 - 2^1 */fsquare (t1, t0); + /* 2^42 - 2^2 */fsquare (t0, t1); + /* 2^50 - 2^10 */for (i = 2; i < 10; i += 2) { + fsquare (t1, t0); + fsquare (t0, t1); + } + /* 2^50 - 2^0 */fmul (z2_50_0, t0, z2_10_0); + + /* 2^51 - 2^1 */fsquare (t0, z2_50_0); + /* 2^52 - 2^2 */fsquare (t1, t0); + /* 2^100 - 2^50 */for (i = 2; i < 50; i += 2) { + fsquare (t0, t1); + fsquare (t1, t0); + } + /* 2^100 - 2^0 */fmul (z2_100_0, t1, z2_50_0); + + /* 2^101 - 2^1 */fsquare (t1, z2_100_0); + /* 2^102 - 2^2 */fsquare (t0, t1); + /* 2^200 - 2^100 */for (i = 2; i < 100; i += 2) { + fsquare (t1, t0); + fsquare (t0, t1); + } + /* 2^200 - 2^0 */fmul (t1, t0, z2_100_0); + + /* 2^201 - 2^1 */fsquare (t0, t1); + /* 2^202 - 2^2 */fsquare (t1, t0); + /* 2^250 - 2^50 */for (i = 2; i < 50; i += 2) { + fsquare (t0, t1); + fsquare (t1, t0); + } + /* 2^250 - 2^0 */fmul (t0, t1, z2_50_0); + + /* 2^251 - 2^1 */fsquare (t1, t0); + /* 2^252 - 2^2 */fsquare (t0, t1); + /* 2^253 - 2^3 */fsquare (t1, t0); + /* 2^254 - 2^4 */fsquare (t0, t1); + /* 2^255 - 2^5 */fsquare (t1, t0); + /* 2^255 - 21 */fmul (out, t1, z11); +} + +int curve25519 (u8 *mypublic, const u8 *secret, const u8 *basepoint) +{ + limb bp[10], x[10], z[11], zmone[10]; + uint8_t e[32]; + int i; + + for (i = 0; i < 32; ++i) + e[i] = secret[i]; + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + fexpand (bp, basepoint); + cmult (x, z, e, bp); + crecip (zmone, z); + fmul (z, x, zmone); + fcontract (mypublic, z); + return 0; +} diff --git a/src/libcryptobox/curve25519/curve25519.h b/src/libcryptobox/curve25519/curve25519.h new file mode 100644 index 000000000..2d87e34cb --- /dev/null +++ b/src/libcryptobox/curve25519/curve25519.h @@ -0,0 +1,10 @@ +#ifndef CURVE25519_H +#define CURVE25519_H + +#include "config.h" + +static const guchar curve25519_basepoint[32] = {9}; + +int curve25519 (guchar *mypublic, const guchar *secret, const guchar *basepoint); + +#endif diff --git a/src/libcryptobox/curve25519/ref.c b/src/libcryptobox/curve25519/ref.c new file mode 100644 index 000000000..ac3827c27 --- /dev/null +++ b/src/libcryptobox/curve25519/ref.c @@ -0,0 +1,323 @@ +/* +version 20081011 +Matthew Dempsky +Public domain. +Derived from public domain code by D. J. Bernstein. +20140216 tweak: Mask top bit of point input. +*/ + +static void add (unsigned int out[32], const unsigned int a[32], + const unsigned int b[32]) +{ + unsigned int j; + unsigned int u; + u = 0; + for (j = 0; j < 31; ++j) { + u += a[j] + b[j]; + out[j] = u & 255; + u >>= 8; + } + u += a[31] + b[31]; + out[31] = u; +} + +static void sub (unsigned int out[32], const unsigned int a[32], + const unsigned int b[32]) +{ + unsigned int j; + unsigned int u; + u = 218; + for (j = 0; j < 31; ++j) { + u += a[j] + 65280 - b[j]; + out[j] = u & 255; + u >>= 8; + } + u += a[31] - b[31]; + out[31] = u; +} + +static void squeeze (unsigned int a[32]) +{ + unsigned int j; + unsigned int u; + u = 0; + for (j = 0; j < 31; ++j) { + u += a[j]; + a[j] = u & 255; + u >>= 8; + } + u += a[31]; + a[31] = u & 127; + u = 19 * (u >> 7); + for (j = 0; j < 31; ++j) { + u += a[j]; + a[j] = u & 255; + u >>= 8; + } + u += a[31]; + a[31] = u; +} + +static const unsigned int minusp[32] = { 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 }; + +static void freeze (unsigned int a[32]) +{ + unsigned int aorig[32]; + unsigned int j; + unsigned int negative; + + for (j = 0; j < 32; ++j) + aorig[j] = a[j]; + add (a, a, minusp); + negative = -((a[31] >> 7) & 1); + for (j = 0; j < 32; ++j) + a[j] ^= negative & (aorig[j] ^ a[j]); +} + +static void mult (unsigned int out[32], const unsigned int a[32], + const unsigned int b[32]) +{ + unsigned int i; + unsigned int j; + unsigned int u; + + for (i = 0; i < 32; ++i) { + u = 0; + for (j = 0; j <= i; ++j) + u += a[j] * b[i - j]; + for (j = i + 1; j < 32; ++j) + u += 38 * a[j] * b[i + 32 - j]; + out[i] = u; + } + squeeze (out); +} + +static void mult121665 (unsigned int out[32], const unsigned int a[32]) +{ + unsigned int j; + unsigned int u; + + u = 0; + for (j = 0; j < 31; ++j) { + u += 121665 * a[j]; + out[j] = u & 255; + u >>= 8; + } + u += 121665 * a[31]; + out[31] = u & 127; + u = 19 * (u >> 7); + for (j = 0; j < 31; ++j) { + u += out[j]; + out[j] = u & 255; + u >>= 8; + } + u += out[j]; + out[j] = u; +} + +static void square (unsigned int out[32], const unsigned int a[32]) +{ + unsigned int i; + unsigned int j; + unsigned int u; + + for (i = 0; i < 32; ++i) { + u = 0; + for (j = 0; j < i - j; ++j) + u += a[j] * a[i - j]; + for (j = i + 1; j < i + 32 - j; ++j) + u += 38 * a[j] * a[i + 32 - j]; + u *= 2; + if ((i & 1) == 0) { + u += a[i / 2] * a[i / 2]; + u += 38 * a[i / 2 + 16] * a[i / 2 + 16]; + } + out[i] = u; + } + squeeze (out); +} + +static void select (unsigned int p[64], unsigned int q[64], + const unsigned int r[64], const unsigned int s[64], unsigned int b) +{ + unsigned int j; + unsigned int t; + unsigned int bminus1; + + bminus1 = b - 1; + for (j = 0; j < 64; ++j) { + t = bminus1 & (r[j] ^ s[j]); + p[j] = s[j] ^ t; + q[j] = r[j] ^ t; + } +} + +static void mainloop (unsigned int work[64], const unsigned char e[32]) +{ + unsigned int xzm1[64]; + unsigned int xzm[64]; + unsigned int xzmb[64]; + unsigned int xzm1b[64]; + unsigned int xznb[64]; + unsigned int xzn1b[64]; + unsigned int a0[64]; + unsigned int a1[64]; + unsigned int b0[64]; + unsigned int b1[64]; + unsigned int c1[64]; + unsigned int r[32]; + unsigned int s[32]; + unsigned int t[32]; + unsigned int u[32]; + unsigned int i; + unsigned int j; + unsigned int b; + int pos; + + for (j = 0; j < 32; ++j) + xzm1[j] = work[j]; + xzm1[32] = 1; + for (j = 33; j < 64; ++j) + xzm1[j] = 0; + + xzm[0] = 1; + for (j = 1; j < 64; ++j) + xzm[j] = 0; + + for (pos = 254; pos >= 0; --pos) { + b = e[pos / 8] >> (pos & 7); + b &= 1; + select (xzmb, xzm1b, xzm, xzm1, b); + add (a0, xzmb, xzmb + 32); + sub (a0 + 32, xzmb, xzmb + 32); + add (a1, xzm1b, xzm1b + 32); + sub (a1 + 32, xzm1b, xzm1b + 32); + square (b0, a0); + square (b0 + 32, a0 + 32); + mult (b1, a1, a0 + 32); + mult (b1 + 32, a1 + 32, a0); + add (c1, b1, b1 + 32); + sub (c1 + 32, b1, b1 + 32); + square (r, c1 + 32); + sub (s, b0, b0 + 32); + mult121665 (t, s); + add (u, t, b0); + mult (xznb, b0, b0 + 32); + mult (xznb + 32, s, u); + square (xzn1b, c1); + mult (xzn1b + 32, r, work); + select (xzm, xzm1, xznb, xzn1b, b); + } + + for (j = 0; j < 64; ++j) + work[j] = xzm[j]; +} + +static void recip (unsigned int out[32], const unsigned int z[32]) +{ + unsigned int z2[32]; + unsigned int z9[32]; + unsigned int z11[32]; + unsigned int z2_5_0[32]; + unsigned int z2_10_0[32]; + unsigned int z2_20_0[32]; + unsigned int z2_50_0[32]; + unsigned int z2_100_0[32]; + unsigned int t0[32]; + unsigned int t1[32]; + int i; + + /* 2 */square (z2, z); + /* 4 */square (t1, z2); + /* 8 */square (t0, t1); + /* 9 */mult (z9, t0, z); + /* 11 */mult (z11, z9, z2); + /* 22 */square (t0, z11); + /* 2^5 - 2^0 = 31 */mult (z2_5_0, t0, z9); + + /* 2^6 - 2^1 */square (t0, z2_5_0); + /* 2^7 - 2^2 */square (t1, t0); + /* 2^8 - 2^3 */square (t0, t1); + /* 2^9 - 2^4 */square (t1, t0); + /* 2^10 - 2^5 */square (t0, t1); + /* 2^10 - 2^0 */mult (z2_10_0, t0, z2_5_0); + + /* 2^11 - 2^1 */square (t0, z2_10_0); + /* 2^12 - 2^2 */square (t1, t0); + /* 2^20 - 2^10 */for (i = 2; i < 10; i += 2) { + square (t0, t1); + square (t1, t0); + } + /* 2^20 - 2^0 */mult (z2_20_0, t1, z2_10_0); + + /* 2^21 - 2^1 */square (t0, z2_20_0); + /* 2^22 - 2^2 */square (t1, t0); + /* 2^40 - 2^20 */for (i = 2; i < 20; i += 2) { + square (t0, t1); + square (t1, t0); + } + /* 2^40 - 2^0 */mult (t0, t1, z2_20_0); + + /* 2^41 - 2^1 */square (t1, t0); + /* 2^42 - 2^2 */square (t0, t1); + /* 2^50 - 2^10 */for (i = 2; i < 10; i += 2) { + square (t1, t0); + square (t0, t1); + } + /* 2^50 - 2^0 */mult (z2_50_0, t0, z2_10_0); + + /* 2^51 - 2^1 */square (t0, z2_50_0); + /* 2^52 - 2^2 */square (t1, t0); + /* 2^100 - 2^50 */for (i = 2; i < 50; i += 2) { + square (t0, t1); + square (t1, t0); + } + /* 2^100 - 2^0 */mult (z2_100_0, t1, z2_50_0); + + /* 2^101 - 2^1 */square (t1, z2_100_0); + /* 2^102 - 2^2 */square (t0, t1); + /* 2^200 - 2^100 */for (i = 2; i < 100; i += 2) { + square (t1, t0); + square (t0, t1); + } + /* 2^200 - 2^0 */mult (t1, t0, z2_100_0); + + /* 2^201 - 2^1 */square (t0, t1); + /* 2^202 - 2^2 */square (t1, t0); + /* 2^250 - 2^50 */for (i = 2; i < 50; i += 2) { + square (t0, t1); + square (t1, t0); + } + /* 2^250 - 2^0 */mult (t0, t1, z2_50_0); + + /* 2^251 - 2^1 */square (t1, t0); + /* 2^252 - 2^2 */square (t0, t1); + /* 2^253 - 2^3 */square (t1, t0); + /* 2^254 - 2^4 */square (t0, t1); + /* 2^255 - 2^5 */square (t1, t0); + /* 2^255 - 21 */mult (out, t1, z11); +} + +int curve25519 (unsigned char *q, const unsigned char *n, + const unsigned char *p) +{ + unsigned int work[96]; + unsigned char e[32]; + unsigned int i; + for (i = 0; i < 32; ++i) + e[i] = n[i]; + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + for (i = 0; i < 32; ++i) + work[i] = p[i]; + work[31] &= 127; + mainloop (work, e); + recip (work + 32, work + 32); + mult (work + 64, work, work + 32); + freeze (work + 64); + for (i = 0; i < 32; ++i) + q[i] = work[64 + i]; + return 0; +} diff --git a/src/libcryptobox/poly1305/README.md b/src/libcryptobox/poly1305/README.md new file mode 100644 index 000000000..510e0f781 --- /dev/null +++ b/src/libcryptobox/poly1305/README.md @@ -0,0 +1,112 @@ +"A state-of-the-art message-authentication code"
+
+# ABOUT
+
+See: [http://cr.yp.to/mac.html](http://cr.yp.to/mac.html) and [http://cr.yp.to/mac/poly1305-20050329.pdf](http://cr.yp.to/mac/poly1305-20050329.pdf)
+
+These are quite portable implementations of increasing efficiency depending on the size of the multiplier available.
+Optimized implementations have been moved to [poly1305-opt](https://github.com/floodyberry/poly1305-opt)
+
+# BUILDING
+
+## Default
+
+If compiled with no options, `poly1305-donna.c` will select between the 32 bit and 64 bit implementations based
+on what it can tell the compiler supports
+
+ gcc poly1305-donna.c -O3 -o poly1305.o
+
+## Selecting a specific version
+
+ gcc poly1305-donna.c -O3 -o poly1305.o -DPOLY1305_XXBITS
+
+Where `-DPOLY1305_XXBITS` is one of
+
+ * `-DPOLY1305_8BITS`, 8->16 bit multiplies, 32 bit additions
+ * `-DPOLY1305_16BITS`, 16->32 bit multiples, 32 bit additions
+ * `-DPOLY1305_32BITS`, 32->64 bit multiplies, 64 bit additions
+ * `-DPOLY1305_64BITS`, 64->128 bit multiplies, 128 bit additions
+
+8 bit and 16 bit versions were written to keep the code size small, 32 bit and 64 bit versions are mildly optimized due
+to needing fewer multiplications. All 4 can be made faster at the expense of increased code size and complexity, which
+is not the intention of this project.
+
+# USAGE
+
+See: [http://nacl.cace-project.eu/onetimeauth.html](http://nacl.cace-project.eu/onetimeauth.html), in specific, slightly plagiarized:
+
+The poly1305_auth function, viewed as a function of the message for a uniform random key, is
+designed to meet the standard notion of unforgeability after a single message. After the sender
+authenticates one message, an attacker cannot find authenticators for any other messages.
+
+The sender **MUST NOT** use poly1305_auth to authenticate more than one message under the same key.
+Authenticators for two messages under the same key should be expected to reveal enough information
+to allow forgeries of authenticators on other messages.
+
+## Functions
+
+`poly1305_context` is declared in [poly1305.h](poly1305.h) and is an opaque structure large enough to support
+every underlying platform specific implementation. It should be size_t aligned, which should be handled already
+with the size_t member `aligner`.
+
+`void poly1305_init(poly1305_context *ctx, const unsigned char key[32]);`
+
+where
+
+`key` is the 32 byte key that is **only used for this message and is discarded immediately after**
+
+`void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes);`
+
+where `m` is a pointer to the message fragment to be processed, and
+
+`bytes` is the length of the message fragment
+
+`void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]);`
+
+where `mac` is the buffer which receives the 16 byte authenticator. After calling finish, the underlying
+implementation will zero out `ctx`.
+
+`void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]);`
+
+where `mac` is the buffer which receives the 16 byte authenticator,
+
+`m` is a pointer to the message to be processed,
+
+`bytes` is the number of bytes in the message, and
+
+`key` is the 32 byte key that is **only used for this message and is discarded immediately after**.
+
+`int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);`
+
+where `mac1` is compared to `mac2` in constant time and returns `1` if they are equal and `0` if they are not
+
+`int poly1305_power_on_self_test(void);`
+
+tests the underlying implementation to verify it is working correctly. It returns `1` if all tests pass, and `0` if
+any tests fail.
+
+## Example
+
+### Simple
+
+ #include "poly1305-donna.h"
+
+ unsigned char key[32] = {...}, mac[16];
+ unsigned char msg[] = {...};
+
+ poly1305_auth(mac, msg, msglen, key);
+
+### Full
+
+[example-poly1305.c](example-poly1305.c) is a simple example of how to verify the underlying implementation is producing
+the correct results, compute an authenticator, and test it against an expected value.
+
+# LICENSE
+
+[MIT](http://www.opensource.org/licenses/mit-license.php) or PUBLIC DOMAIN
+
+
+# NAMESAKE
+
+I borrowed the idea for these from Adam Langley's [curve25519-donna](http://github.com/agl/curve25519-donna), hence
+the name.
\ No newline at end of file diff --git a/src/libcryptobox/poly1305/poly1305-donna-16.h b/src/libcryptobox/poly1305/poly1305-donna-16.h new file mode 100644 index 000000000..5e5c6d3b6 --- /dev/null +++ b/src/libcryptobox/poly1305/poly1305-donna-16.h @@ -0,0 +1,202 @@ +/* + poly1305 implementation using 16 bit * 16 bit = 32 bit multiplication and 32 bit addition +*/ + +#if defined(_MSC_VER) + #define POLY1305_NOINLINE __declspec(noinline) +#elif defined(__GNUC__) + #define POLY1305_NOINLINE __attribute__((noinline)) +#else + #define POLY1305_NOINLINE +#endif + +#define poly1305_block_size 16 + +/* 17 + sizeof(size_t) + 18*sizeof(unsigned short) */ +typedef struct poly1305_state_internal_t { + unsigned char buffer[poly1305_block_size]; + size_t leftover; + unsigned short r[10]; + unsigned short h[10]; + unsigned short pad[8]; + unsigned char final; +} poly1305_state_internal_t; + +/* interpret two 8 bit unsigned integers as a 16 bit unsigned integer in little endian */ +static unsigned short +U8TO16(const unsigned char *p) { + return + (((unsigned short)(p[0] & 0xff) ) | + ((unsigned short)(p[1] & 0xff) << 8)); +} + +/* store a 16 bit unsigned integer as two 8 bit unsigned integers in little endian */ +static void +U16TO8(unsigned char *p, unsigned short v) { + p[0] = (v ) & 0xff; + p[1] = (v >> 8) & 0xff; +} + +void +poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { + poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; + unsigned short t0,t1,t2,t3,t4,t5,t6,t7; + size_t i; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + t0 = U8TO16(&key[ 0]); st->r[0] = ( t0 ) & 0x1fff; + t1 = U8TO16(&key[ 2]); st->r[1] = ((t0 >> 13) | (t1 << 3)) & 0x1fff; + t2 = U8TO16(&key[ 4]); st->r[2] = ((t1 >> 10) | (t2 << 6)) & 0x1f03; + t3 = U8TO16(&key[ 6]); st->r[3] = ((t2 >> 7) | (t3 << 9)) & 0x1fff; + t4 = U8TO16(&key[ 8]); st->r[4] = ((t3 >> 4) | (t4 << 12)) & 0x00ff; + st->r[5] = ((t4 >> 1) ) & 0x1ffe; + t5 = U8TO16(&key[10]); st->r[6] = ((t4 >> 14) | (t5 << 2)) & 0x1fff; + t6 = U8TO16(&key[12]); st->r[7] = ((t5 >> 11) | (t6 << 5)) & 0x1f81; + t7 = U8TO16(&key[14]); st->r[8] = ((t6 >> 8) | (t7 << 8)) & 0x1fff; + st->r[9] = ((t7 >> 5) ) & 0x007f; + + /* h = 0 */ + for (i = 0; i < 10; i++) + st->h[i] = 0; + + /* save pad for later */ + for (i = 0; i < 8; i++) + st->pad[i] = U8TO16(&key[16 + (2 * i)]); + + st->leftover = 0; + st->final = 0; +} + +static void +poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { + const unsigned short hibit = (st->final) ? 0 : (1 << 11); /* 1 << 128 */ + unsigned short t0,t1,t2,t3,t4,t5,t6,t7; + unsigned long d[10]; + unsigned long c; + + while (bytes >= poly1305_block_size) { + size_t i, j; + + /* h += m[i] */ + t0 = U8TO16(&m[ 0]); st->h[0] += ( t0 ) & 0x1fff; + t1 = U8TO16(&m[ 2]); st->h[1] += ((t0 >> 13) | (t1 << 3)) & 0x1fff; + t2 = U8TO16(&m[ 4]); st->h[2] += ((t1 >> 10) | (t2 << 6)) & 0x1fff; + t3 = U8TO16(&m[ 6]); st->h[3] += ((t2 >> 7) | (t3 << 9)) & 0x1fff; + t4 = U8TO16(&m[ 8]); st->h[4] += ((t3 >> 4) | (t4 << 12)) & 0x1fff; + st->h[5] += ((t4 >> 1) ) & 0x1fff; + t5 = U8TO16(&m[10]); st->h[6] += ((t4 >> 14) | (t5 << 2)) & 0x1fff; + t6 = U8TO16(&m[12]); st->h[7] += ((t5 >> 11) | (t6 << 5)) & 0x1fff; + t7 = U8TO16(&m[14]); st->h[8] += ((t6 >> 8) | (t7 << 8)) & 0x1fff; + st->h[9] += ((t7 >> 5) ) | hibit; + + /* h *= r, (partial) h %= p */ + for (i = 0, c = 0; i < 10; i++) { + d[i] = c; + for (j = 0; j < 10; j++) { + d[i] += (unsigned long)st->h[j] * ((j <= i) ? st->r[i - j] : (5 * st->r[i + 10 - j])); + /* Sum(h[i] * r[i] * 5) will overflow slightly above 6 products with an unclamped r, so carry at 5 */ + if (j == 4) { + c = (d[i] >> 13); + d[i] &= 0x1fff; + } + } + c += (d[i] >> 13); + d[i] &= 0x1fff; + } + c = ((c << 2) + c); /* c *= 5 */ + c += d[0]; + d[0] = ((unsigned short)c & 0x1fff); + c = (c >> 13); + d[1] += c; + + for (i = 0; i < 10; i++) + st->h[i] = (unsigned short)d[i]; + + m += poly1305_block_size; + bytes -= poly1305_block_size; + } +} + +POLY1305_NOINLINE void +poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { + poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; + unsigned short c; + unsigned short g[10]; + unsigned short mask; + unsigned long f; + size_t i; + + /* process the remaining block */ + if (st->leftover) { + size_t i = st->leftover; + st->buffer[i++] = 1; + for (; i < poly1305_block_size; i++) + st->buffer[i] = 0; + st->final = 1; + poly1305_blocks(st, st->buffer, poly1305_block_size); + } + + /* fully carry h */ + c = st->h[1] >> 13; + st->h[1] &= 0x1fff; + for (i = 2; i < 10; i++) { + st->h[i] += c; + c = st->h[i] >> 13; + st->h[i] &= 0x1fff; + } + st->h[0] += (c * 5); + c = st->h[0] >> 13; + st->h[0] &= 0x1fff; + st->h[1] += c; + c = st->h[1] >> 13; + st->h[1] &= 0x1fff; + st->h[2] += c; + + /* compute h + -p */ + g[0] = st->h[0] + 5; + c = g[0] >> 13; + g[0] &= 0x1fff; + for (i = 1; i < 10; i++) { + g[i] = st->h[i] + c; + c = g[i] >> 13; + g[i] &= 0x1fff; + } + g[9] -= (1 << 13); + + /* select h if h < p, or h + -p if h >= p */ + mask = (g[9] >> ((sizeof(unsigned short) * 8) - 1)) - 1; + for (i = 0; i < 10; i++) + g[i] &= mask; + mask = ~mask; + for (i = 0; i < 10; i++) + st->h[i] = (st->h[i] & mask) | g[i]; + + /* h = h % (2^128) */ + st->h[0] = ((st->h[0] ) | (st->h[1] << 13) ) & 0xffff; + st->h[1] = ((st->h[1] >> 3) | (st->h[2] << 10) ) & 0xffff; + st->h[2] = ((st->h[2] >> 6) | (st->h[3] << 7) ) & 0xffff; + st->h[3] = ((st->h[3] >> 9) | (st->h[4] << 4) ) & 0xffff; + st->h[4] = ((st->h[4] >> 12) | (st->h[5] << 1) | (st->h[6] << 14)) & 0xffff; + st->h[5] = ((st->h[6] >> 2) | (st->h[7] << 11) ) & 0xffff; + st->h[6] = ((st->h[7] >> 5) | (st->h[8] << 8) ) & 0xffff; + st->h[7] = ((st->h[8] >> 8) | (st->h[9] << 5) ) & 0xffff; + + /* mac = (h + pad) % (2^128) */ + f = (unsigned long)st->h[0] + st->pad[0]; + st->h[0] = (unsigned short)f; + for (i = 1; i < 8; i++) { + f = (unsigned long)st->h[i] + st->pad[i] + (f >> 16); + st->h[i] = (unsigned short)f; + } + + for (i = 0; i < 8; i++) + U16TO8(mac + (i * 2), st->h[i]); + + /* zero out the state */ + for (i = 0; i < 10; i++) + st->h[i] = 0; + for (i = 0; i < 10; i++) + st->r[i] = 0; + for (i = 0; i < 8; i++) + st->pad[i] = 0; +} diff --git a/src/libcryptobox/poly1305/poly1305-donna-32.h b/src/libcryptobox/poly1305/poly1305-donna-32.h new file mode 100644 index 000000000..c45aab33a --- /dev/null +++ b/src/libcryptobox/poly1305/poly1305-donna-32.h @@ -0,0 +1,219 @@ +/* + poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition +*/ + +#if defined(_MSC_VER) + #define POLY1305_NOINLINE __declspec(noinline) +#elif defined(__GNUC__) + #define POLY1305_NOINLINE __attribute__((noinline)) +#else + #define POLY1305_NOINLINE +#endif + +#define poly1305_block_size 16 + +/* 17 + sizeof(size_t) + 14*sizeof(unsigned long) */ +typedef struct poly1305_state_internal_t { + unsigned long r[5]; + unsigned long h[5]; + unsigned long pad[4]; + size_t leftover; + unsigned char buffer[poly1305_block_size]; + unsigned char final; +} poly1305_state_internal_t; + +/* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */ +static unsigned long +U8TO32(const unsigned char *p) { + return + (((unsigned long)(p[0] & 0xff) ) | + ((unsigned long)(p[1] & 0xff) << 8) | + ((unsigned long)(p[2] & 0xff) << 16) | + ((unsigned long)(p[3] & 0xff) << 24)); +} + +/* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */ +static void +U32TO8(unsigned char *p, unsigned long v) { + p[0] = (v ) & 0xff; + p[1] = (v >> 8) & 0xff; + p[2] = (v >> 16) & 0xff; + p[3] = (v >> 24) & 0xff; +} + +void +poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { + poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + st->r[0] = (U8TO32(&key[ 0]) ) & 0x3ffffff; + st->r[1] = (U8TO32(&key[ 3]) >> 2) & 0x3ffff03; + st->r[2] = (U8TO32(&key[ 6]) >> 4) & 0x3ffc0ff; + st->r[3] = (U8TO32(&key[ 9]) >> 6) & 0x3f03fff; + st->r[4] = (U8TO32(&key[12]) >> 8) & 0x00fffff; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + + /* save pad for later */ + st->pad[0] = U8TO32(&key[16]); + st->pad[1] = U8TO32(&key[20]); + st->pad[2] = U8TO32(&key[24]); + st->pad[3] = U8TO32(&key[28]); + + st->leftover = 0; + st->final = 0; +} + +static void +poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { + const unsigned long hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */ + unsigned long r0,r1,r2,r3,r4; + unsigned long s1,s2,s3,s4; + unsigned long h0,h1,h2,h3,h4; + unsigned long long d0,d1,d2,d3,d4; + unsigned long c; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + r3 = st->r[3]; + r4 = st->r[4]; + + s1 = r1 * 5; + s2 = r2 * 5; + s3 = r3 * 5; + s4 = r4 * 5; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + while (bytes >= poly1305_block_size) { + /* h += m[i] */ + h0 += (U8TO32(m+ 0) ) & 0x3ffffff; + h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff; + h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff; + h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff; + h4 += (U8TO32(m+12) >> 8) | hibit; + + /* h *= r */ + d0 = ((unsigned long long)h0 * r0) + ((unsigned long long)h1 * s4) + ((unsigned long long)h2 * s3) + ((unsigned long long)h3 * s2) + ((unsigned long long)h4 * s1); + d1 = ((unsigned long long)h0 * r1) + ((unsigned long long)h1 * r0) + ((unsigned long long)h2 * s4) + ((unsigned long long)h3 * s3) + ((unsigned long long)h4 * s2); + d2 = ((unsigned long long)h0 * r2) + ((unsigned long long)h1 * r1) + ((unsigned long long)h2 * r0) + ((unsigned long long)h3 * s4) + ((unsigned long long)h4 * s3); + d3 = ((unsigned long long)h0 * r3) + ((unsigned long long)h1 * r2) + ((unsigned long long)h2 * r1) + ((unsigned long long)h3 * r0) + ((unsigned long long)h4 * s4); + d4 = ((unsigned long long)h0 * r4) + ((unsigned long long)h1 * r3) + ((unsigned long long)h2 * r2) + ((unsigned long long)h3 * r1) + ((unsigned long long)h4 * r0); + + /* (partial) h %= p */ + c = (unsigned long)(d0 >> 26); h0 = (unsigned long)d0 & 0x3ffffff; + d1 += c; c = (unsigned long)(d1 >> 26); h1 = (unsigned long)d1 & 0x3ffffff; + d2 += c; c = (unsigned long)(d2 >> 26); h2 = (unsigned long)d2 & 0x3ffffff; + d3 += c; c = (unsigned long)(d3 >> 26); h3 = (unsigned long)d3 & 0x3ffffff; + d4 += c; c = (unsigned long)(d4 >> 26); h4 = (unsigned long)d4 & 0x3ffffff; + h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff; + h1 += c; + + m += poly1305_block_size; + bytes -= poly1305_block_size; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; + st->h[3] = h3; + st->h[4] = h4; +} + +POLY1305_NOINLINE void +poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { + poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; + unsigned long h0,h1,h2,h3,h4,c; + unsigned long g0,g1,g2,g3,g4; + unsigned long long f; + unsigned long mask; + + /* process the remaining block */ + if (st->leftover) { + size_t i = st->leftover; + st->buffer[i++] = 1; + for (; i < poly1305_block_size; i++) + st->buffer[i] = 0; + st->final = 1; + poly1305_blocks(st, st->buffer, poly1305_block_size); + } + + /* fully carry h */ + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + c = h1 >> 26; h1 = h1 & 0x3ffffff; + h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff; + h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff; + h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff; + h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff; + h1 += c; + + /* compute h + -p */ + g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff; + g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff; + g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff; + g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff; + g4 = h4 + c - (1 << 26); + + /* select h if h < p, or h + -p if h >= p */ + mask = (g4 >> ((sizeof(unsigned long) * 8) - 1)) - 1; + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + g4 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + h4 = (h4 & mask) | g4; + + /* h = h % (2^128) */ + h0 = ((h0 ) | (h1 << 26)) & 0xffffffff; + h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; + h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; + h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; + + /* mac = (h + pad) % (2^128) */ + f = (unsigned long long)h0 + st->pad[0] ; h0 = (unsigned long)f; + f = (unsigned long long)h1 + st->pad[1] + (f >> 32); h1 = (unsigned long)f; + f = (unsigned long long)h2 + st->pad[2] + (f >> 32); h2 = (unsigned long)f; + f = (unsigned long long)h3 + st->pad[3] + (f >> 32); h3 = (unsigned long)f; + + U32TO8(mac + 0, h0); + U32TO8(mac + 4, h1); + U32TO8(mac + 8, h2); + U32TO8(mac + 12, h3); + + /* zero out the state */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + st->r[0] = 0; + st->r[1] = 0; + st->r[2] = 0; + st->r[3] = 0; + st->r[4] = 0; + st->pad[0] = 0; + st->pad[1] = 0; + st->pad[2] = 0; + st->pad[3] = 0; +} + diff --git a/src/libcryptobox/poly1305/poly1305-donna-64.h b/src/libcryptobox/poly1305/poly1305-donna-64.h new file mode 100644 index 000000000..016f5b384 --- /dev/null +++ b/src/libcryptobox/poly1305/poly1305-donna-64.h @@ -0,0 +1,224 @@ +/* + poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition +*/ + +#if defined(_MSC_VER) + #include <intrin.h> + + typedef struct uint128_t { + unsigned long long lo; + unsigned long long hi; + } uint128_t; + + #define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi) + #define ADD(out, in) { unsigned long long t = out.lo; out.lo += in.lo; out.hi += (out.lo < t) + in.hi; } + #define ADDLO(out, in) { unsigned long long t = out.lo; out.lo += in; out.hi += (out.lo < t); } + #define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift))) + #define LO(in) (in.lo) + + #define POLY1305_NOINLINE __declspec(noinline) +#elif defined(__GNUC__) + #if defined(__SIZEOF_INT128__) + typedef unsigned __int128 uint128_t; + #else + typedef unsigned uint128_t __attribute__((mode(TI))); + #endif + + #define MUL(out, x, y) out = ((uint128_t)x * y) + #define ADD(out, in) out += in + #define ADDLO(out, in) out += in + #define SHR(in, shift) (unsigned long long)(in >> (shift)) + #define LO(in) (unsigned long long)(in) + + #define POLY1305_NOINLINE __attribute__((noinline)) +#endif + +#define poly1305_block_size 16 + +/* 17 + sizeof(size_t) + 8*sizeof(unsigned long long) */ +typedef struct poly1305_state_internal_t { + unsigned long long r[3]; + unsigned long long h[3]; + unsigned long long pad[2]; + size_t leftover; + unsigned char buffer[poly1305_block_size]; + unsigned char final; +} poly1305_state_internal_t; + +/* interpret eight 8 bit unsigned integers as a 64 bit unsigned integer in little endian */ +static unsigned long long +U8TO64(const unsigned char *p) { + return + (((unsigned long long)(p[0] & 0xff) ) | + ((unsigned long long)(p[1] & 0xff) << 8) | + ((unsigned long long)(p[2] & 0xff) << 16) | + ((unsigned long long)(p[3] & 0xff) << 24) | + ((unsigned long long)(p[4] & 0xff) << 32) | + ((unsigned long long)(p[5] & 0xff) << 40) | + ((unsigned long long)(p[6] & 0xff) << 48) | + ((unsigned long long)(p[7] & 0xff) << 56)); +} + +/* store a 64 bit unsigned integer as eight 8 bit unsigned integers in little endian */ +static void +U64TO8(unsigned char *p, unsigned long long v) { + p[0] = (v ) & 0xff; + p[1] = (v >> 8) & 0xff; + p[2] = (v >> 16) & 0xff; + p[3] = (v >> 24) & 0xff; + p[4] = (v >> 32) & 0xff; + p[5] = (v >> 40) & 0xff; + p[6] = (v >> 48) & 0xff; + p[7] = (v >> 56) & 0xff; +} + +void +poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { + poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; + unsigned long long t0,t1; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + t0 = U8TO64(&key[0]); + t1 = U8TO64(&key[8]); + + st->r[0] = ( t0 ) & 0xffc0fffffff; + st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; + st->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + + /* save pad for later */ + st->pad[0] = U8TO64(&key[16]); + st->pad[1] = U8TO64(&key[24]); + + st->leftover = 0; + st->final = 0; +} + +static void +poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { + const unsigned long long hibit = (st->final) ? 0 : ((unsigned long long)1 << 40); /* 1 << 128 */ + unsigned long long r0,r1,r2; + unsigned long long s1,s2; + unsigned long long h0,h1,h2; + unsigned long long c; + uint128_t d0,d1,d2,d; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + s1 = r1 * (5 << 2); + s2 = r2 * (5 << 2); + + while (bytes >= poly1305_block_size) { + unsigned long long t0,t1; + + /* h += m[i] */ + t0 = U8TO64(&m[0]); + t1 = U8TO64(&m[8]); + + h0 += (( t0 ) & 0xfffffffffff); + h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff); + h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit; + + /* h *= r */ + MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d); + MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d); + MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d); + + /* (partial) h %= p */ + c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff; + ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff; + ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff; + h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff; + h1 += c; + + m += poly1305_block_size; + bytes -= poly1305_block_size; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; +} + + +POLY1305_NOINLINE void +poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { + poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; + unsigned long long h0,h1,h2,c; + unsigned long long g0,g1,g2; + unsigned long long t0,t1; + + /* process the remaining block */ + if (st->leftover) { + size_t i = st->leftover; + st->buffer[i] = 1; + for (i = i + 1; i < poly1305_block_size; i++) + st->buffer[i] = 0; + st->final = 1; + poly1305_blocks(st, st->buffer, poly1305_block_size); + } + + /* fully carry h */ + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + c = (h1 >> 44); h1 &= 0xfffffffffff; + h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; + h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; + h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff; + h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; + h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; + h1 += c; + + /* compute h + -p */ + g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff; + g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff; + g2 = h2 + c - ((unsigned long long)1 << 42); + + /* select h if h < p, or h + -p if h >= p */ + c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1; + g0 &= c; + g1 &= c; + g2 &= c; + c = ~c; + h0 = (h0 & c) | g0; + h1 = (h1 & c) | g1; + h2 = (h2 & c) | g2; + + /* h = (h + pad) */ + t0 = st->pad[0]; + t1 = st->pad[1]; + + h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff; + h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff; + h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff; + + /* mac = h % (2^128) */ + h0 = ((h0 ) | (h1 << 44)); + h1 = ((h1 >> 20) | (h2 << 24)); + + U64TO8(&mac[0], h0); + U64TO8(&mac[8], h1); + + /* zero out the state */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->r[0] = 0; + st->r[1] = 0; + st->r[2] = 0; + st->pad[0] = 0; + st->pad[1] = 0; +} + diff --git a/src/libcryptobox/poly1305/poly1305-donna-8.h b/src/libcryptobox/poly1305/poly1305-donna-8.h new file mode 100644 index 000000000..ac5d5aeb5 --- /dev/null +++ b/src/libcryptobox/poly1305/poly1305-donna-8.h @@ -0,0 +1,186 @@ +/* + poly1305 implementation using 8 bit * 8 bit = 16 bit multiplication and 32 bit addition + + based on the public domain reference version in supercop by djb +*/ + +#if defined(_MSC_VER) + #define POLY1305_NOINLINE __declspec(noinline) +#elif defined(__GNUC__) + #define POLY1305_NOINLINE __attribute__((noinline)) +#else + #define POLY1305_NOINLINE +#endif + +#define poly1305_block_size 16 + +/* 17 + sizeof(size_t) + 51*sizeof(unsigned char) */ +typedef struct poly1305_state_internal_t { + unsigned char buffer[poly1305_block_size]; + size_t leftover; + unsigned char h[17]; + unsigned char r[17]; + unsigned char pad[17]; + unsigned char final; +} poly1305_state_internal_t; + +void +poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { + poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; + size_t i; + + st->leftover = 0; + + /* h = 0 */ + for (i = 0; i < 17; i++) + st->h[i] = 0; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + st->r[ 0] = key[ 0] & 0xff; + st->r[ 1] = key[ 1] & 0xff; + st->r[ 2] = key[ 2] & 0xff; + st->r[ 3] = key[ 3] & 0x0f; + st->r[ 4] = key[ 4] & 0xfc; + st->r[ 5] = key[ 5] & 0xff; + st->r[ 6] = key[ 6] & 0xff; + st->r[ 7] = key[ 7] & 0x0f; + st->r[ 8] = key[ 8] & 0xfc; + st->r[ 9] = key[ 9] & 0xff; + st->r[10] = key[10] & 0xff; + st->r[11] = key[11] & 0x0f; + st->r[12] = key[12] & 0xfc; + st->r[13] = key[13] & 0xff; + st->r[14] = key[14] & 0xff; + st->r[15] = key[15] & 0x0f; + st->r[16] = 0; + + /* save pad for later */ + for (i = 0; i < 16; i++) + st->pad[i] = key[i + 16]; + st->pad[16] = 0; + + st->final = 0; +} + +static void +poly1305_add(unsigned char h[17], const unsigned char c[17]) { + unsigned short u; + unsigned int i; + for (u = 0, i = 0; i < 17; i++) { + u += (unsigned short)h[i] + (unsigned short)c[i]; + h[i] = (unsigned char)u & 0xff; + u >>= 8; + } +} + +static void +poly1305_squeeze(unsigned char h[17], unsigned long hr[17]) { + unsigned long u; + unsigned int i; + u = 0; + for (i = 0; i < 16; i++) { + u += hr[i]; + h[i] = (unsigned char)u & 0xff; + u >>= 8; + } + u += hr[16]; + h[16] = (unsigned char)u & 0x03; + u >>= 2; + u += (u << 2); /* u *= 5; */ + for (i = 0; i < 16; i++) { + u += h[i]; + h[i] = (unsigned char)u & 0xff; + u >>= 8; + } + h[16] += (unsigned char)u; +} + +static void +poly1305_freeze(unsigned char h[17]) { + static const unsigned char minusp[17] = { + 0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0xfc + }; + unsigned char horig[17], negative; + unsigned int i; + + /* compute h + -p */ + for (i = 0; i < 17; i++) + horig[i] = h[i]; + poly1305_add(h, minusp); + + /* select h if h < p, or h + -p if h >= p */ + negative = -(h[16] >> 7); + for (i = 0; i < 17; i++) + h[i] ^= negative & (horig[i] ^ h[i]); +} + +static void +poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { + const unsigned char hibit = st->final ^ 1; /* 1 << 128 */ + + while (bytes >= poly1305_block_size) { + unsigned long hr[17], u; + unsigned char c[17]; + unsigned int i, j; + + /* h += m */ + for (i = 0; i < 16; i++) + c[i] = m[i]; + c[16] = hibit; + poly1305_add(st->h, c); + + /* h *= r */ + for (i = 0; i < 17; i++) { + u = 0; + for (j = 0; j <= i ; j++) { + u += (unsigned short)st->h[j] * st->r[i - j]; + } + for (j = i + 1; j < 17; j++) { + unsigned long v = (unsigned short)st->h[j] * st->r[i + 17 - j]; + v = ((v << 8) + (v << 6)); /* v *= (5 << 6); */ + u += v; + } + hr[i] = u; + } + + /* (partial) h %= p */ + poly1305_squeeze(st->h, hr); + + m += poly1305_block_size; + bytes -= poly1305_block_size; + } +} + +POLY1305_NOINLINE void +poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { + poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; + size_t i; + + /* process the remaining block */ + if (st->leftover) { + size_t i = st->leftover; + st->buffer[i++] = 1; + for (; i < poly1305_block_size; i++) + st->buffer[i] = 0; + st->final = 1; + poly1305_blocks(st, st->buffer, poly1305_block_size); + } + + /* fully reduce h */ + poly1305_freeze(st->h); + + /* h = (h + pad) % (1 << 128) */ + poly1305_add(st->h, st->pad); + for (i = 0; i < 16; i++) + mac[i] = st->h[i]; + + /* zero out the state */ + for (i = 0; i < 17; i++) + st->h[i] = 0; + for (i = 0; i < 17; i++) + st->r[i] = 0; + for (i = 0; i < 17; i++) + st->pad[i] = 0; +} diff --git a/src/libcryptobox/poly1305/poly1305-donna.c b/src/libcryptobox/poly1305/poly1305-donna.c new file mode 100644 index 000000000..c1e3c74b5 --- /dev/null +++ b/src/libcryptobox/poly1305/poly1305-donna.c @@ -0,0 +1,201 @@ +#include "poly1305-donna.h" + +#if defined(POLY1305_8BIT) +#include "poly1305-donna-8.h" +#elif defined(POLY1305_16BIT) +#include "poly1305-donna-16.h" +#elif defined(POLY1305_32BIT) +#include "poly1305-donna-32.h" +#elif defined(POLY1305_64BIT) +#include "poly1305-donna-64.h" +#else + +/* auto detect between 32bit / 64bit */ +#define HAS_SIZEOF_INT128_64BIT (defined(__SIZEOF_INT128__) && defined(__LP64__)) +#define HAS_MSVC_64BIT (defined(_MSC_VER) && defined(_M_X64)) +#define HAS_GCC_4_4_64BIT (defined(__GNUC__) && defined(__LP64__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)))) + +#if (HAS_SIZEOF_INT128_64BIT || HAS_MSVC_64BIT || HAS_GCC_4_4_64BIT) +#include "poly1305-donna-64.h" +#else +#include "poly1305-donna-32.h" +#endif + +#endif + +void +poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes) { + poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; + size_t i; + + /* handle leftover */ + if (st->leftover) { + size_t want = (poly1305_block_size - st->leftover); + if (want > bytes) + want = bytes; + for (i = 0; i < want; i++) + st->buffer[st->leftover + i] = m[i]; + bytes -= want; + m += want; + st->leftover += want; + if (st->leftover < poly1305_block_size) + return; + poly1305_blocks(st, st->buffer, poly1305_block_size); + st->leftover = 0; + } + + /* process full blocks */ + if (bytes >= poly1305_block_size) { + size_t want = (bytes & ~(poly1305_block_size - 1)); + poly1305_blocks(st, m, want); + m += want; + bytes -= want; + } + + /* store leftover */ + if (bytes) { + for (i = 0; i < bytes; i++) + st->buffer[st->leftover + i] = m[i]; + st->leftover += bytes; + } +} + +void +poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]) { + poly1305_context ctx; + poly1305_init(&ctx, key); + poly1305_update(&ctx, m, bytes); + poly1305_finish(&ctx, mac); +} + +int +poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]) { + size_t i; + unsigned int dif = 0; + for (i = 0; i < 16; i++) + dif |= (mac1[i] ^ mac2[i]); + dif = (dif - 1) >> ((sizeof(unsigned int) * 8) - 1); + return (dif & 1); +} + + +/* test a few basic operations */ +int +poly1305_power_on_self_test(void) { + /* example from nacl */ + static const unsigned char nacl_key[32] = { + 0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91, + 0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25, + 0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65, + 0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80, + }; + + static const unsigned char nacl_msg[131] = { + 0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73, + 0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce, + 0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4, + 0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a, + 0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b, + 0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72, + 0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2, + 0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38, + 0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a, + 0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae, + 0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea, + 0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda, + 0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde, + 0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3, + 0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6, + 0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74, + 0xe3,0x55,0xa5 + }; + + static const unsigned char nacl_mac[16] = { + 0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5, + 0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9 + }; + + /* generates a final value of (2^130 - 2) == 3 */ + static const unsigned char wrap_key[32] = { + 0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + }; + + static const unsigned char wrap_msg[16] = { + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + }; + + static const unsigned char wrap_mac[16] = { + 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + }; + + /* + mac of the macs of messages of length 0 to 256, where the key and messages + have all their values set to the length + */ + static const unsigned char total_key[32] = { + 0x01,0x02,0x03,0x04,0x05,0x06,0x07, + 0xff,0xfe,0xfd,0xfc,0xfb,0xfa,0xf9, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff + }; + + static const unsigned char total_mac[16] = { + 0x64,0xaf,0xe2,0xe8,0xd6,0xad,0x7b,0xbd, + 0xd2,0x87,0xf9,0x7c,0x44,0x62,0x3d,0x39 + }; + + poly1305_context ctx; + poly1305_context total_ctx; + unsigned char all_key[32]; + unsigned char all_msg[256]; + unsigned char mac[16]; + size_t i, j; + int result = 1; + + for (i = 0; i < sizeof(mac); i++) + mac[i] = 0; + poly1305_auth(mac, nacl_msg, sizeof(nacl_msg), nacl_key); + result &= poly1305_verify(nacl_mac, mac); + + for (i = 0; i < sizeof(mac); i++) + mac[i] = 0; + poly1305_init(&ctx, nacl_key); + poly1305_update(&ctx, nacl_msg + 0, 32); + poly1305_update(&ctx, nacl_msg + 32, 64); + poly1305_update(&ctx, nacl_msg + 96, 16); + poly1305_update(&ctx, nacl_msg + 112, 8); + poly1305_update(&ctx, nacl_msg + 120, 4); + poly1305_update(&ctx, nacl_msg + 124, 2); + poly1305_update(&ctx, nacl_msg + 126, 1); + poly1305_update(&ctx, nacl_msg + 127, 1); + poly1305_update(&ctx, nacl_msg + 128, 1); + poly1305_update(&ctx, nacl_msg + 129, 1); + poly1305_update(&ctx, nacl_msg + 130, 1); + poly1305_finish(&ctx, mac); + result &= poly1305_verify(nacl_mac, mac); + + for (i = 0; i < sizeof(mac); i++) + mac[i] = 0; + poly1305_auth(mac, wrap_msg, sizeof(wrap_msg), wrap_key); + result &= poly1305_verify(wrap_mac, mac); + + poly1305_init(&total_ctx, total_key); + for (i = 0; i < 256; i++) { + /* set key and message to 'i,i,i..' */ + for (j = 0; j < sizeof(all_key); j++) + all_key[j] = i; + for (j = 0; j < i; j++) + all_msg[j] = i; + poly1305_auth(mac, all_msg, i, all_key); + poly1305_update(&total_ctx, mac, 16); + } + poly1305_finish(&total_ctx, mac); + result &= poly1305_verify(total_mac, mac); + + return result; +} diff --git a/src/libcryptobox/poly1305/poly1305-donna.h b/src/libcryptobox/poly1305/poly1305-donna.h new file mode 100644 index 000000000..94e23533f --- /dev/null +++ b/src/libcryptobox/poly1305/poly1305-donna.h @@ -0,0 +1,20 @@ +#ifndef POLY1305_DONNA_H +#define POLY1305_DONNA_H + +#include <stddef.h> + +typedef struct poly1305_context { + size_t aligner; + unsigned char opaque[136]; +} poly1305_context; + +void poly1305_init(poly1305_context *ctx, const unsigned char key[32]); +void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes); +void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]); +void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]); + +int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]); +int poly1305_power_on_self_test(void); + +#endif /* POLY1305_DONNA_H */ + |