Browse Source

Use optimized version of poly1305.

tags/0.9.0
Vsevolod Stakhov 9 years ago
parent
commit
06a8ad2bae

+ 10
- 1
src/libcryptobox/CMakeLists.txt View File

@@ -4,7 +4,7 @@ INCLUDE(AsmOp.cmake)
TARGET_ARCHITECTURE(ARCH)

SET(CHACHASRC chacha20/chacha.c chacha20/ref.c)
SET(POLYSRC poly1305/poly1305-donna.c)
SET(POLYSRC poly1305/poly1305.c)

# For now we support only x86_64 architecture with optimizations
IF(${ARCH} STREQUAL "x86_64")
@@ -12,35 +12,44 @@ IF(${ARCH} STREQUAL "x86_64")
ASM_OP(HAVE_AVX2 "vpaddq %ymm0, %ymm0, %ymm0" "avx2")
ASM_OP(HAVE_AVX "vpaddq %xmm0, %xmm0, %xmm0" "avx")
ASM_OP(HAVE_SSE2 "pmuludq %xmm0, %xmm0" "sse2")
ASM_OP(HAVE_SLASHMACRO "
.macro TEST1 op
\\op %eax, %eax
.endm
TEST1 xorl
" "slash macro convention")
ASM_OP(HAVE_DOLLARMACRO "
.macro TEST1 op
$0 %eax, %eax
.endm
TEST1 xorl
" "dollar macro convention")
CONFIGURE_FILE(platform_config.h.in platform_config.h)
INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
SET(CURVESRC curve25519/curve25519-donna-c64.c)
SET(POLYSRC ${POLYSRC} poly1305/ref-64.c)
ELSEIF(${ARCH} STREQUAL "i386")
SET(POLYSRC ${POLYSRC} poly1305/ref-32.c)
SET(CURVESRC curve25519/curve25519-donna.c)
ELSE()
SET(CURVESRC curve25519/ref.c)
SET(POLYSRC ${POLYSRC} poly1305/ref-32.c)
ENDIF()

# Add the hand-written assembly back ends for every instruction set whose
# ASM_OP probe succeeded. Each back end contributes one chacha20 and one
# poly1305 source file.
IF(HAVE_AVX2)
	LIST(APPEND CHACHASRC chacha20/avx2.S)
	LIST(APPEND POLYSRC poly1305/avx2.S)
ENDIF()
IF(HAVE_AVX)
	LIST(APPEND CHACHASRC chacha20/avx.S)
	LIST(APPEND POLYSRC poly1305/avx.S)
ENDIF()
IF(HAVE_SSE2)
	LIST(APPEND CHACHASRC chacha20/sse2.S)
	LIST(APPEND POLYSRC poly1305/sse2.S)
ENDIF()

SET(LIBCRYPTOBOXSRC cryptobox.c)

+ 1
- 0
src/libcryptobox/chacha20/avx.S View File

@@ -1,4 +1,5 @@
#include "macro.S"
#include "constants.S"
SECTION_TEXT

GLOBAL_HIDDEN_FN chacha_blocks_avx

+ 1
- 0
src/libcryptobox/chacha20/avx2.S View File

@@ -1,4 +1,5 @@
#include "macro.S"
#include "constants.S"
SECTION_TEXT

GLOBAL_HIDDEN_FN chacha_blocks_avx2

+ 1
- 0
src/libcryptobox/chacha20/chacha.c View File

@@ -1,4 +1,5 @@
/* Copyright (c) 2015, Vsevolod Stakhov
* Copyright (c) 2015, Andrew Moon
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without

+ 6
- 0
src/libcryptobox/chacha20/constants.S View File

@@ -0,0 +1,6 @@
/*
 * Read-only constant table for the chacha20 assembly back ends; this file is
 * pulled in with #include "constants.S" right after macro.S.
 *
 * Layout relative to chacha_constants (16-byte aligned):
 *    0: four 32-bit words spelling "expand 32-byte k" in little-endian order
 *   16: 16-byte pshufb control mask (rotate by 16)
 *   32: 16-byte pshufb control mask (rotate by 8)
 */
SECTION_RODATA
.p2align 4,,15
chacha_constants:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 /* "expand 32-byte k" */
.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */
.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */

+ 0
- 7
src/libcryptobox/chacha20/macro.S View File

@@ -176,10 +176,3 @@
#endif
#endif
.endm

SECTION_RODATA
.p2align 4,,15
chacha_constants:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 /* "expand 32-byte k" */
.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */
.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */

+ 1
- 1
src/libcryptobox/chacha20/sse2.S View File

@@ -1,5 +1,5 @@
#include "macro.S"
#include "constants.S"
SECTION_TEXT

GLOBAL_HIDDEN_FN chacha_blocks_sse2

+ 6
- 5
src/libcryptobox/cryptobox.c View File

@@ -24,7 +24,7 @@
#include "cryptobox.h"
#include "platform_config.h"
#include "chacha20/chacha.h"
#include "poly1305/poly1305-donna.h"
#include "poly1305/poly1305.h"
#include "curve25519/curve25519.h"
#include "ottery.h"

@@ -105,6 +105,7 @@ rspamd_cryptobox_init (void)
}

chacha_load ();
poly1305_load ();
}

void
@@ -139,7 +140,7 @@ void rspamd_cryptobox_encrypt_nm_inplace (guchar *data, gsize len,
const rspamd_nonce_t nonce,
const rspamd_nm_t nm, rspamd_sig_t sig)
{
poly1305_context mac_ctx;
poly1305_state mac_ctx;
guchar subkey[CHACHA_BLOCKBYTES];
chacha_state s;
gsize r;
@@ -151,7 +152,7 @@ void rspamd_cryptobox_encrypt_nm_inplace (guchar *data, gsize len,
r = chacha_update (&s, data, data, len);
chacha_final (&s, data + r);

poly1305_init (&mac_ctx, subkey);
poly1305_init (&mac_ctx, (const poly1305_key *)subkey);
poly1305_update (&mac_ctx, data, len);
poly1305_finish (&mac_ctx, sig);

@@ -163,7 +164,7 @@ gboolean
rspamd_cryptobox_decrypt_nm_inplace (guchar *data, gsize len,
const rspamd_nonce_t nonce, const rspamd_nm_t nm, const rspamd_sig_t sig)
{
poly1305_context mac_ctx;
poly1305_state mac_ctx;
guchar subkey[CHACHA_BLOCKBYTES];
rspamd_sig_t mac;
chacha_state s;
@@ -175,7 +176,7 @@ rspamd_cryptobox_decrypt_nm_inplace (guchar *data, gsize len,
memset (subkey, 0, sizeof (subkey));
chacha_update (&s, subkey, subkey, sizeof (subkey));

poly1305_init (&mac_ctx, subkey);
poly1305_init (&mac_ctx, (const poly1305_key *)subkey);
poly1305_update (&mac_ctx, data, len);
poly1305_finish (&mac_ctx, mac);


+ 0
- 112
src/libcryptobox/poly1305/README.md View File

@@ -1,112 +0,0 @@
"A state-of-the-art message-authentication code"
# ABOUT
See: [http://cr.yp.to/mac.html](http://cr.yp.to/mac.html) and [http://cr.yp.to/mac/poly1305-20050329.pdf](http://cr.yp.to/mac/poly1305-20050329.pdf)
These are quite portable implementations of increasing efficiency depending on the size of the multiplier available.
Optimized implementations have been moved to [poly1305-opt](https://github.com/floodyberry/poly1305-opt)
# BUILDING
## Default
If compiled with no options, `poly1305-donna.c` will select between the 32 bit and 64 bit implementations based
on what it can tell the compiler supports
gcc poly1305-donna.c -O3 -o poly1305.o
## Selecting a specific version
gcc poly1305-donna.c -O3 -o poly1305.o -DPOLY1305_XXBITS
Where `-DPOLY1305_XXBITS` is one of
* `-DPOLY1305_8BITS`, 8->16 bit multiplies, 32 bit additions
* `-DPOLY1305_16BITS`, 16->32 bit multiplies, 32 bit additions
* `-DPOLY1305_32BITS`, 32->64 bit multiplies, 64 bit additions
* `-DPOLY1305_64BITS`, 64->128 bit multiplies, 128 bit additions
8 bit and 16 bit versions were written to keep the code size small, 32 bit and 64 bit versions are mildly optimized due
to needing fewer multiplications. All 4 can be made faster at the expense of increased code size and complexity, which
is not the intention of this project.
# USAGE
See: [http://nacl.cace-project.eu/onetimeauth.html](http://nacl.cace-project.eu/onetimeauth.html); specifically (slightly plagiarized):
The poly1305_auth function, viewed as a function of the message for a uniform random key, is
designed to meet the standard notion of unforgeability after a single message. After the sender
authenticates one message, an attacker cannot find authenticators for any other messages.
The sender **MUST NOT** use poly1305_auth to authenticate more than one message under the same key.
Authenticators for two messages under the same key should be expected to reveal enough information
to allow forgeries of authenticators on other messages.
## Functions
`poly1305_context` is declared in [poly1305-donna.h](poly1305-donna.h) and is an opaque structure large enough to support
every underlying platform specific implementation. It should be size_t aligned, which should be handled already
with the size_t member `aligner`.
`void poly1305_init(poly1305_context *ctx, const unsigned char key[32]);`
where
`key` is the 32 byte key that is **only used for this message and is discarded immediately after**
`void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes);`
where `m` is a pointer to the message fragment to be processed, and
`bytes` is the length of the message fragment
`void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]);`
where `mac` is the buffer which receives the 16 byte authenticator. After calling finish, the underlying
implementation will zero out `ctx`.
`void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]);`
where `mac` is the buffer which receives the 16 byte authenticator,
`m` is a pointer to the message to be processed,
`bytes` is the number of bytes in the message, and
`key` is the 32 byte key that is **only used for this message and is discarded immediately after**.
`int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);`
where `mac1` is compared to `mac2` in constant time and returns `1` if they are equal and `0` if they are not
`int poly1305_power_on_self_test(void);`
tests the underlying implementation to verify it is working correctly. It returns `1` if all tests pass, and `0` if
any tests fail.
## Example
### Simple
#include "poly1305-donna.h"
unsigned char key[32] = {...}, mac[16];
unsigned char msg[] = {...};
poly1305_auth(mac, msg, msglen, key);
### Full
[example-poly1305.c](example-poly1305.c) is a simple example of how to verify the underlying implementation is producing
the correct results, compute an authenticator, and test it against an expected value.
# LICENSE
[MIT](http://www.opensource.org/licenses/mit-license.php) or PUBLIC DOMAIN
# NAMESAKE
I borrowed the idea for these from Adam Langley's [curve25519-donna](http://github.com/agl/curve25519-donna), hence
the name.

+ 875
- 0
src/libcryptobox/poly1305/avx.S View File

@@ -0,0 +1,875 @@
#include "../chacha20/macro.S"
#include "constants.S"

SECTION_TEXT

/*
 * poly1305_block_size_avx: takes no input and returns the constant 32 in %eax.
 * NOTE(review): presumably the number of bytes the AVX back end expects per
 * blocks call (the code below works in 32-byte units) - confirm with the C
 * dispatcher.
 */
GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx,0,0
movl $32, %eax
ret
FN_END poly1305_block_size_avx

/*
 * poly1305_init_ext_avx
 *   %rdi = state, %rsi = 32-byte key, %rdx = byte-count hint (0 is treated as
 *   "all ones", i.e. no limit).
 *
 * State bytes written here:
 *     0..47   zeroed (the blocks routine keeps the accumulator at 0..39)
 *    40..59   key bytes 0..15, masked, as five 32-bit limbs of 26 bits
 *    60..79   square of that value (only computed when hint > 16)
 *    80..99   square of the previous square (only computed when hint > 95)
 *   104..119  key bytes 16..31, copied verbatim
 *   120..127  flags qword, set to 0
 *
 * Callee-saved registers are pushed/popped; scratch values live below %rsp
 * (the function makes no calls).
 */
GLOBAL_HIDDEN_FN_EXT poly1305_init_ext_avx,4,1
poly1305_init_ext_avx_local:
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp
pushq %rbx
movq %rdi, %rbp
/* hint = (%rdx != 0) ? %rdx : -1, kept at -16(%rsp) */
testq %rdx, %rdx
movq $-1, %rax
cmovne %rdx, %rax
movq %rax, -16(%rsp)
/* clear state bytes 0..47 */
vpxor %xmm0, %xmm0, %xmm0
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm0, 16(%rdi)
vmovdqu %xmm0, 32(%rdi)
/*
 * Load key bytes 0..15 and split them into three masked limbs:
 *   %r10 = bits 0..43   & 0xffc0fffffff
 *   %r9  = bits 44..87  & 0xfffffc0ffff
 *   %r8  = bits 88..127 & 0x00ffffffc0f
 */
movq (%rsi), %r9
movq 8(%rsi), %r8
movabsq $17575274610687, %r10
andq %r9, %r10
shrq $44, %r9
movq %r8, %rax
salq $20, %rax
orq %rax, %r9
movabsq $17592181915647, %rax
andq %rax, %r9
shrq $24, %r8
movabsq $68719475727, %rax
andq %rax, %r8
/* repack the three limbs as five 26-bit limbs (mask 67108863 = 2^26 - 1) at state+40 */
leaq 40(%rdi), %r15
movl %r10d, %eax
andl $67108863, %eax
movl %eax, 40(%rdi)
movl %r9d, %edx
sall $18, %edx
movq %r10, %rax
shrq $26, %rax
orl %edx, %eax
andl $67108863, %eax
movl %eax, 44(%rdi)
movq %r9, %rax
shrq $8, %rax
andl $67108863, %eax
movl %eax, 48(%rdi)
movq %r9, %rax
shrq $34, %rax
movl %r8d, %edx
sall $10, %edx
orl %edx, %eax
andl $67108863, %eax
movl %eax, 52(%rdi)
movq %r8, %rax
shrq $16, %rax
movl %eax, 56(%rdi)
/* copy key bytes 16..31 to state+104 */
movq 16(%rsi), %rax
movq %rax, 104(%rdi)
movq 24(%rsi), %rax
movq %rax, 112(%rdi)
/* %rbx counts squaring passes (at most two) */
movl $0, %ebx
.L7:
testq %rbx, %rbx
jne .L4
/* pass 0: destination state+60; skipped entirely when hint <= 16 */
leaq 60(%rbp), %r15
cmpq $16, -16(%rsp)
ja .L6
jmp .L5
.L4:
cmpq $1, %rbx
jne .L6
/* pass 1: destination state+80; skipped when hint <= 95 */
leaq 80(%rbp), %r15
cmpq $95, -16(%rsp)
jbe .L5
.L6:
/*
 * Square the value held in (%r10, %r9, %r8) using 64x64->128 multiplies.
 * %rsi = 20 * %r8 is the wrap-around multiplier for the cross terms.
 * The three partial sums are masked to 44, 44 and 42 bits; the bits shifted
 * out of the top limb are multiplied by 5 and added to the bottom limb.
 * The result replaces (%r10, %r9, %r8), so the next pass squares it again.
 */
leaq (%r8,%r8,4), %rsi
salq $2, %rsi
leaq (%r9,%r9), %rdi
movq %rdi, %rax
mulq %rsi
movq %rax, %r13
movq %rdx, %r14
movq %r10, %rax
mulq %r10
addq %r13, %rax
adcq %r14, %rdx
movabsq $17592186044415, %rcx
movq %rax, -72(%rsp)
movq %rdx, -64(%rsp)
andq -72(%rsp), %rcx
leaq (%r10,%r10), %r11
movq %r11, %rax
mulq %r9
movq %rax, %r11
movq %rdx, %r12
movq %rsi, %rax
mulq %r8
movq %rax, %r13
movq %rdx, %r14
addq %r11, %r13
adcq %r12, %r14
movq -72(%rsp), %rax
movq -64(%rsp), %rdx
shrdq $44, %rdx, %rax
movq %rax, -56(%rsp)
movq $0, -48(%rsp)
addq -56(%rsp), %r13
adcq -48(%rsp), %r14
movabsq $17592186044415, %rsi
andq %r13, %rsi
leaq (%r8,%r8), %rdi
movq %rdi, %rax
mulq %r10
movq %rax, %r11
movq %rdx, %r12
movq %r9, %rax
mulq %r9
addq %r11, %rax
adcq %r12, %rdx
shrdq $44, %r14, %r13
movq %r13, -40(%rsp)
movq $0, -32(%rsp)
addq -40(%rsp), %rax
adcq -32(%rsp), %rdx
movabsq $4398046511103, %rdi
andq %rax, %rdi
shrdq $42, %rdx, %rax
leaq (%rax,%rax,4), %r8
addq %rcx, %r8
movabsq $17592186044415, %r10
andq %r8, %r10
shrq $44, %r8
addq %rsi, %r8
movabsq $17592186044415, %r9
andq %r8, %r9
shrq $44, %r8
addq %rdi, %r8
/* store the squared value as five 26-bit limbs at (%r15) */
movl %r10d, %eax
andl $67108863, %eax
movl %eax, (%r15)
movl %r9d, %edx
sall $18, %edx
movq %r10, %rax
shrq $26, %rax
orl %edx, %eax
andl $67108863, %eax
movl %eax, 4(%r15)
movq %r9, %rax
shrq $8, %rax
andl $67108863, %eax
movl %eax, 8(%r15)
movl %r8d, %edx
sall $10, %edx
movq %r9, %rax
shrq $34, %rax
orl %edx, %eax
andl $67108863, %eax
movl %eax, 12(%r15)
movq %r8, %rax
shrq $16, %rax
movl %eax, 16(%r15)
addq $1, %rbx
cmpq $2, %rbx
jne .L7
.L5:
/* flags = 0, restore callee-saved registers */
movq $0, 120(%rbp)
popq %rbx
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
ret
FN_END poly1305_init_ext_avx



/*
 * poly1305_blocks_avx
 *   %rdi = state, %rsi = input pointer (0 selects the final-reduction path),
 *   %rdx = byte count.
 *
 * Works on two 16-byte blocks at a time: every xmm register carries one
 * 26-bit limb for each of the two 64-bit lanes.
 *
 * State fields read/written (offsets in bytes):
 *     0..39   accumulator, five limbs x two lanes packed as 32-bit words
 *             (on the final path: three qwords at 0, 8, 16)
 *    40..59, 60..79, 80..99   multiplier limbs prepared by poly1305_init_ext_avx
 *   120       flags qword; bits tested here:
 *               1      accumulator already holds data
 *               4      keep the (1 << 24) top-limb constant in the low lane only
 *               8      top-limb constant is zero in both lanes
 *               16/32  select the per-lane multiplier pair for the last multiply
 *
 * The frame is 64-byte aligned; multiplier spills also use the area just
 * below %rsp (the function makes no calls).
 */
GLOBAL_HIDDEN_FN poly1305_blocks_avx
poly1305_blocks_avx_local:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
andq $-64, %rsp
subq $200, %rsp
/* broadcast constants to both lanes: xmm1 = 1 << 24, xmm0 = 2^26 - 1, xmm2 = 5 */
movl $(1 << 24), %eax
movl $((1 << 26) - 1), %r8d
movl $(5), %r9d
vmovd %eax, %xmm1
vmovd %r8d, %xmm0
vmovd %r9d, %xmm2
vpshufd $68, %xmm1, %xmm1
vpshufd $68, %xmm0, %xmm0
vpshufd $68, %xmm2, %xmm2
/* 152(%rsp) = top-limb constant, 184(%rsp) = 5 */
vmovdqa %xmm1, 152(%rsp)
vmovdqa %xmm2, 184(%rsp)
movq 120(%rdi), %rax
/* flag 4: drop the constant from the high lane */
testb $4, %al
je .L12
vpsrldq $8, %xmm1, %xmm1
vmovdqa %xmm1, 152(%rsp)
.L12:
/* flag 8: drop the constant from both lanes */
testb $8, %al
je .L13
vpxor %xmm1, %xmm1, %xmm1
vmovdqa %xmm1, 152(%rsp)
.L13:
testb $1, %al
jne .L14
/*
 * Flag 1 clear: the accumulator is seeded directly from the first 32 input
 * bytes (16 bytes per lane), split into five 26-bit limbs
 * (xmm7, xmm12, xmm3, xmm6, xmm2); input advances by 32 and flag 1 is set.
 */
vmovq (%rsi), %xmm1
vpinsrq $1, 16(%rsi), %xmm1, %xmm1
vmovq 8(%rsi), %xmm3
vpinsrq $1, 24(%rsi), %xmm3, %xmm2
vpand %xmm0, %xmm1, %xmm7
vpsrlq $26, %xmm1, %xmm12
vpand %xmm0, %xmm12, %xmm12
vpsllq $12, %xmm2, %xmm3
vpsrlq $52, %xmm1, %xmm1
vpor %xmm3, %xmm1, %xmm6
vpand %xmm0, %xmm6, %xmm3
vpsrlq $26, %xmm6, %xmm6
vpand %xmm0, %xmm6, %xmm6
vpsrlq $40, %xmm2, %xmm2
vpor 152(%rsp), %xmm2, %xmm2
addq $32, %rsi
subq $32, %rdx
orq $1, %rax
movq %rax, 120(%rdi)
jmp .L15
.L14:
/* Flag 1 set: reload the packed accumulator from state+0 and spread each 32-bit limb into its lane */
vmovdqu (%rdi), %xmm12
vmovdqu 16(%rdi), %xmm6
vmovdqu 32(%rdi), %xmm2
vpshufd $80, %xmm12, %xmm7
vpshufd $250, %xmm12, %xmm12
vpshufd $80, %xmm6, %xmm3
vpshufd $250, %xmm6, %xmm6
vpshufd $80, %xmm2, %xmm2
.L15:
/*
 * Choose the multiplier used by the 32-byte multiply at .L20 and by the
 * second half of the main loop (limbs end up in xmm1, xmm11, xmm10, xmm5, xmm4):
 *   flags & 48 == 0 : value at state+60 in both lanes
 *   flag 16         : low lane = value at state+60, high lane = value at state+40
 *   otherwise (32)  : low lane = value at state+40, high lane = constant 1
 */
movq 120(%rdi), %rax
testb $48, %al
je .L16
testb $16, %al
je .L17
vmovdqu 40(%rdi), %xmm1
vmovd 56(%rdi), %xmm4
vmovdqu 60(%rdi), %xmm5
vpunpckldq %xmm1, %xmm5, %xmm11
vpunpckhdq %xmm1, %xmm5, %xmm5
vmovd 76(%rdi), %xmm1
vpunpcklqdq %xmm4, %xmm1, %xmm4
jmp .L18
.L17:
movl $(1), %r8d
vmovdqu 40(%rdi), %xmm5
vmovd 56(%rdi), %xmm4
vmovd %r8d, %xmm1
vpunpckldq %xmm1, %xmm5, %xmm11
vpunpckhdq %xmm1, %xmm5, %xmm5
.L18:
vpshufd $80, %xmm11, %xmm1
vpshufd $250, %xmm11, %xmm11
vpshufd $80, %xmm5, %xmm10
vpshufd $250, %xmm5, %xmm5
jmp .L19
.L16:
vmovdqu 60(%rdi), %xmm5
vpshufd $0, %xmm5, %xmm1
vpshufd $85, %xmm5, %xmm11
vpshufd $170, %xmm5, %xmm10
vpshufd $255, %xmm5, %xmm5
vmovd 76(%rdi), %xmm4
vpshufd $0, %xmm4, %xmm4
.L19:
/* spill multiplier limbs 1..4 and their 5x multiples (24..136(%rsp)); limb 0 stays in xmm1 */
vmovdqa %xmm11, 136(%rsp)
vpmuludq 184(%rsp), %xmm11, %xmm13
vmovdqa %xmm13, 120(%rsp)
vmovdqa %xmm10, 104(%rsp)
vpmuludq 184(%rsp), %xmm10, %xmm13
vmovdqa %xmm13, 88(%rsp)
vmovdqa %xmm5, 72(%rsp)
vpmuludq 184(%rsp), %xmm5, %xmm5
vmovdqa %xmm5, 56(%rsp)
vmovdqa %xmm4, 40(%rsp)
vpmuludq 184(%rsp), %xmm4, %xmm4
vmovdqa %xmm4, 24(%rsp)
/* fewer than 64 bytes left: skip the main loop */
cmpq $63, %rdx
jbe .L20
/* main-loop multiplier: value at state+80 broadcast to both lanes, plus 5x multiples, spilled at -120..8(%rsp) */
vmovdqu 80(%rdi), %xmm4
vpshufd $0, %xmm4, %xmm5
vmovdqa %xmm5, 8(%rsp)
vpshufd $85, %xmm4, %xmm5
vmovdqa %xmm5, -8(%rsp)
vpshufd $170, %xmm4, %xmm13
vmovdqa %xmm13, -24(%rsp)
vpshufd $255, %xmm4, %xmm4
vmovdqa %xmm4, %xmm10
vmovdqa %xmm4, -40(%rsp)
vmovd 96(%rdi), %xmm4
vpshufd $0, %xmm4, %xmm4
vmovdqa %xmm4, %xmm8
vmovdqa %xmm4, -56(%rsp)
vpmuludq 184(%rsp), %xmm5, %xmm4
vmovdqa %xmm4, -72(%rsp)
vpmuludq 184(%rsp), %xmm13, %xmm4
vmovdqa %xmm4, -88(%rsp)
vpmuludq 184(%rsp), %xmm10, %xmm4
vmovdqa %xmm4, -104(%rsp)
vpmuludq 184(%rsp), %xmm8, %xmm4
vmovdqa %xmm4, -120(%rsp)
/* %rax = input + 32 (loop addresses are relative to it), %rcx = remaining bytes; xmm1 is spilled to 168(%rsp) */
leaq 32(%rsi), %rax
movq %rdx, %rcx
vmovdqa %xmm1, 168(%rsp)
jmp .L22
.p2align 6
nop
nop
nop
nop
.L22:
/*
 * Main loop, 64 input bytes per iteration:
 *   - accumulator limbs are multiplied by the state+80 multiplier;
 *   - the 32 bytes at -32(%rax) are split into limbs and multiplied by the
 *     multiplier spilled at .L19 / 168(%rsp);
 *   - the 32 bytes at (%rax) are added in as limbs, with the 152(%rsp)
 *     constant added to the top limb;
 *   - carries are propagated between limbs (the top carry re-enters limb 0
 *     multiplied by 5).
 * Loads and multiplies are interleaved for scheduling; do not reorder.
 */
vpmuludq -72(%rsp), %xmm2, %xmm13
vmovdqa -88(%rsp), %xmm5
vpmuludq %xmm5, %xmm6, %xmm4
vpmuludq %xmm5, %xmm2, %xmm11
vmovdqa -104(%rsp), %xmm9
vpmuludq %xmm9, %xmm6, %xmm5
vpmuludq %xmm9, %xmm2, %xmm10
vpaddq %xmm4, %xmm13, %xmm13
vpmuludq %xmm9, %xmm3, %xmm4
vmovdqa -120(%rsp), %xmm8
vpmuludq %xmm8, %xmm2, %xmm9
vpaddq %xmm5, %xmm11, %xmm11
vmovdqa %xmm8, %xmm5
vpmuludq %xmm8, %xmm12, %xmm8
vpmuludq %xmm5, %xmm3, %xmm14
vpaddq %xmm4, %xmm13, %xmm13
vpmuludq %xmm5, %xmm6, %xmm4
vmovdqa 8(%rsp), %xmm15
vpmuludq %xmm15, %xmm6, %xmm5
vpaddq %xmm8, %xmm13, %xmm13
vpmuludq %xmm15, %xmm2, %xmm8
vpaddq %xmm14, %xmm11, %xmm11
vpmuludq %xmm15, %xmm7, %xmm14
vpaddq %xmm4, %xmm10, %xmm10
vpmuludq %xmm15, %xmm12, %xmm4
vpaddq %xmm5, %xmm9, %xmm9
vpmuludq %xmm15, %xmm3, %xmm5
vmovdqa -8(%rsp), %xmm15
vpmuludq %xmm15, %xmm3, %xmm2
vpaddq %xmm14, %xmm13, %xmm13
vpmuludq %xmm15, %xmm6, %xmm6
vpaddq %xmm4, %xmm11, %xmm11
vpmuludq %xmm15, %xmm7, %xmm4
vpaddq %xmm5, %xmm10, %xmm10
vmovq -32(%rax), %xmm5
vpinsrq $1, -16(%rax), %xmm5, %xmm5
vpmuludq %xmm15, %xmm12, %xmm14
vpaddq %xmm2, %xmm9, %xmm9
vmovdqa -24(%rsp), %xmm2
vpmuludq %xmm2, %xmm12, %xmm15
vpaddq %xmm6, %xmm8, %xmm8
vpmuludq %xmm2, %xmm3, %xmm3
vpaddq %xmm4, %xmm11, %xmm11
vmovq -24(%rax), %xmm4
vpinsrq $1, -8(%rax), %xmm4, %xmm6
vpmuludq %xmm2, %xmm7, %xmm4
vpaddq %xmm14, %xmm10, %xmm10
vmovdqa -40(%rsp), %xmm1
vpmuludq %xmm1, %xmm7, %xmm14
vpaddq %xmm15, %xmm9, %xmm9
vpand %xmm5, %xmm0, %xmm2
vpmuludq %xmm1, %xmm12, %xmm12
vpaddq %xmm3, %xmm8, %xmm8
vpsrlq $26, %xmm5, %xmm3
vpand %xmm3, %xmm0, %xmm3
vpmuludq -56(%rsp), %xmm7, %xmm7
vpaddq %xmm4, %xmm10, %xmm10
vpsllq $12, %xmm6, %xmm15
vpsrlq $52, %xmm5, %xmm4
vpor %xmm15, %xmm4, %xmm4
vpaddq %xmm14, %xmm9, %xmm9
vpsrlq $14, %xmm6, %xmm5
vpand %xmm5, %xmm0, %xmm5
vpaddq %xmm12, %xmm8, %xmm8
vpand %xmm4, %xmm0, %xmm4
vpaddq %xmm7, %xmm8, %xmm8
vpsrlq $40, %xmm6, %xmm6
vpor 152(%rsp), %xmm6, %xmm6
vmovdqu (%rax), %xmm12
vmovdqu 16(%rax), %xmm7
vpunpckldq %xmm7, %xmm12, %xmm15
vpunpckhdq %xmm7, %xmm12, %xmm7
vpxor %xmm14, %xmm14, %xmm14
vpunpckldq %xmm14, %xmm15, %xmm12
vpunpckhdq %xmm14, %xmm15, %xmm15
vpunpckldq %xmm14, %xmm7, %xmm14
vpxor %xmm1, %xmm1, %xmm1
vpunpckhdq %xmm1, %xmm7, %xmm7
vpsllq $6, %xmm15, %xmm15
vpsllq $12, %xmm14, %xmm14
vpsllq $18, %xmm7, %xmm7
vpaddq %xmm12, %xmm13, %xmm12
vpaddq %xmm15, %xmm11, %xmm15
vpaddq %xmm14, %xmm10, %xmm14
vpaddq %xmm7, %xmm9, %xmm7
vpaddq 152(%rsp), %xmm8, %xmm8
vpmuludq 120(%rsp), %xmm6, %xmm13
vmovdqa 88(%rsp), %xmm10
vpmuludq %xmm10, %xmm5, %xmm9
vpmuludq %xmm10, %xmm6, %xmm11
vmovdqa 56(%rsp), %xmm1
vpmuludq %xmm1, %xmm5, %xmm10
vpaddq %xmm13, %xmm12, %xmm12
vpmuludq %xmm1, %xmm6, %xmm13
vpaddq %xmm9, %xmm12, %xmm12
vpmuludq %xmm1, %xmm4, %xmm9
vpaddq %xmm11, %xmm15, %xmm15
vmovdqa 24(%rsp), %xmm1
vpmuludq %xmm1, %xmm6, %xmm11
vpaddq %xmm10, %xmm15, %xmm10
vpmuludq %xmm1, %xmm3, %xmm15
vpaddq %xmm13, %xmm14, %xmm14
vpmuludq %xmm1, %xmm4, %xmm13
vpaddq %xmm9, %xmm12, %xmm9
vpmuludq %xmm1, %xmm5, %xmm12
vpaddq %xmm11, %xmm7, %xmm7
vpmuludq 168(%rsp), %xmm5, %xmm11
vpaddq %xmm15, %xmm9, %xmm9
vpmuludq 168(%rsp), %xmm6, %xmm6
vpaddq %xmm13, %xmm10, %xmm10
vpmuludq 168(%rsp), %xmm2, %xmm15
vpaddq %xmm12, %xmm14, %xmm14
vpmuludq 168(%rsp), %xmm3, %xmm13
vpaddq %xmm11, %xmm7, %xmm11
vpmuludq 168(%rsp), %xmm4, %xmm12
vpaddq %xmm6, %xmm8, %xmm6
vmovdqa 136(%rsp), %xmm8
vpmuludq %xmm8, %xmm4, %xmm7
vpaddq %xmm15, %xmm9, %xmm9
vpmuludq %xmm8, %xmm5, %xmm5
vpaddq %xmm13, %xmm10, %xmm10
vpmuludq %xmm8, %xmm2, %xmm15
vpaddq %xmm12, %xmm14, %xmm14
vpmuludq %xmm8, %xmm3, %xmm8
vpaddq %xmm7, %xmm11, %xmm11
vmovdqa 104(%rsp), %xmm7
vpmuludq %xmm7, %xmm3, %xmm13
vpaddq %xmm5, %xmm6, %xmm6
vpmuludq %xmm7, %xmm4, %xmm4
vpaddq %xmm15, %xmm10, %xmm10
vpmuludq %xmm7, %xmm2, %xmm15
vpaddq %xmm8, %xmm14, %xmm14
vmovdqa 72(%rsp), %xmm5
vpmuludq %xmm5, %xmm2, %xmm7
vpaddq %xmm13, %xmm11, %xmm11
vpmuludq %xmm5, %xmm3, %xmm3
vpaddq %xmm4, %xmm6, %xmm6
vpmuludq 40(%rsp), %xmm2, %xmm2
vpaddq %xmm15, %xmm14, %xmm14
vpaddq %xmm7, %xmm11, %xmm11
vpaddq %xmm3, %xmm6, %xmm6
vpaddq %xmm2, %xmm6, %xmm2
vpsrlq $26, %xmm9, %xmm12
vpsrlq $26, %xmm11, %xmm5
vpand %xmm0, %xmm9, %xmm9
vpand %xmm0, %xmm11, %xmm11
vpaddq %xmm12, %xmm10, %xmm10
vpaddq %xmm5, %xmm2, %xmm2
vpsrlq $26, %xmm10, %xmm3
vpsrlq $26, %xmm2, %xmm7
vpand %xmm0, %xmm10, %xmm10
vpand %xmm0, %xmm2, %xmm2
vpaddq %xmm3, %xmm14, %xmm3
vpmuludq 184(%rsp), %xmm7, %xmm7
vpaddq %xmm7, %xmm9, %xmm9
vpsrlq $26, %xmm3, %xmm6
vpsrlq $26, %xmm9, %xmm12
vpand %xmm0, %xmm3, %xmm3
vpand %xmm0, %xmm9, %xmm7
vpaddq %xmm6, %xmm11, %xmm6
vpaddq %xmm12, %xmm10, %xmm12
vpsrlq $26, %xmm6, %xmm8
vpand %xmm0, %xmm6, %xmm6
vpaddq %xmm8, %xmm2, %xmm2
subq $64, %rcx
addq $64, %rax
cmpq $63, %rcx
ja .L22
/* loop done: reload multiplier limb 0, advance %rsi past the consumed multiple of 64, keep the remainder in %rdx */
vmovdqa 168(%rsp), %xmm1
leaq -64(%rdx), %rax
andq $-64, %rax
leaq 64(%rsi,%rax), %rsi
andl $63, %edx
.L20:
/* fewer than 32 bytes left: nothing more to multiply */
cmpq $31, %rdx
jbe .L23
/* multiply the accumulator by the multiplier selected at .L15 */
vpmuludq 120(%rsp), %xmm2, %xmm11
vmovdqa 88(%rsp), %xmm4
vpmuludq %xmm4, %xmm6, %xmm0
vpmuludq %xmm4, %xmm2, %xmm10
vmovdqa 56(%rsp), %xmm4
vpmuludq %xmm4, %xmm6, %xmm8
vpmuludq %xmm4, %xmm2, %xmm5
vpaddq %xmm0, %xmm11, %xmm11
vpmuludq %xmm4, %xmm3, %xmm0
vmovdqa 24(%rsp), %xmm13
vpmuludq %xmm13, %xmm2, %xmm4
vpaddq %xmm8, %xmm10, %xmm10
vpmuludq %xmm13, %xmm12, %xmm8
vpmuludq %xmm13, %xmm3, %xmm9
vpaddq %xmm0, %xmm11, %xmm11
vpmuludq %xmm13, %xmm6, %xmm13
vpmuludq %xmm1, %xmm6, %xmm0
vpaddq %xmm8, %xmm11, %xmm8
vpmuludq %xmm1, %xmm2, %xmm2
vpaddq %xmm9, %xmm10, %xmm9
vpmuludq %xmm1, %xmm7, %xmm11
vpaddq %xmm13, %xmm5, %xmm5
vpmuludq %xmm1, %xmm12, %xmm10
vpaddq %xmm0, %xmm4, %xmm0
vpmuludq %xmm1, %xmm3, %xmm1
vmovdqa 136(%rsp), %xmm4
vpmuludq %xmm4, %xmm3, %xmm14
vpaddq %xmm11, %xmm8, %xmm11
vpmuludq %xmm4, %xmm6, %xmm6
vpaddq %xmm10, %xmm9, %xmm9
vpmuludq %xmm4, %xmm7, %xmm15
vpaddq %xmm1, %xmm5, %xmm5
vpmuludq %xmm4, %xmm12, %xmm1
vpaddq %xmm14, %xmm0, %xmm0
vmovdqa 104(%rsp), %xmm4
vpmuludq %xmm4, %xmm12, %xmm8
vpaddq %xmm6, %xmm2, %xmm2
vpmuludq %xmm4, %xmm3, %xmm3
vpaddq %xmm15, %xmm9, %xmm9
vpmuludq %xmm4, %xmm7, %xmm10
vpaddq %xmm1, %xmm5, %xmm1
vmovdqa 72(%rsp), %xmm4
vpmuludq %xmm4, %xmm7, %xmm15
vpaddq %xmm8, %xmm0, %xmm0
vpmuludq %xmm4, %xmm12, %xmm12
vpaddq %xmm3, %xmm2, %xmm2
vpmuludq 40(%rsp), %xmm7, %xmm7
vpaddq %xmm10, %xmm1, %xmm1
vpaddq %xmm15, %xmm0, %xmm0
vpaddq %xmm12, %xmm2, %xmm2
vpaddq %xmm7, %xmm2, %xmm2
/* xmm0 was reused above, so the 26-bit mask is rebuilt in xmm15 */
movl $((1 << 26) - 1), %r8d
testq %rsi, %rsi
vmovd %r8d, %xmm15
je .L24
/* input present: add the next 32 bytes as limbs, plus the 152(%rsp) constant on the top limb */
vmovdqu (%rsi), %xmm4
vmovdqu 16(%rsi), %xmm3
vpunpckldq %xmm3, %xmm4, %xmm5
vpunpckhdq %xmm3, %xmm4, %xmm3
vpxor %xmm4, %xmm4, %xmm4
vpunpckldq %xmm4, %xmm5, %xmm7
vpunpckhdq %xmm4, %xmm5, %xmm5
vpunpckldq %xmm4, %xmm3, %xmm6
vpunpckhdq %xmm4, %xmm3, %xmm3
vpsllq $6, %xmm5, %xmm5
vpsllq $12, %xmm6, %xmm6
vpsllq $18, %xmm3, %xmm3
vpaddq %xmm7, %xmm11, %xmm11
vpaddq %xmm5, %xmm9, %xmm9
vpaddq %xmm6, %xmm1, %xmm1
vpaddq %xmm3, %xmm0, %xmm0
vpaddq 152(%rsp), %xmm2, %xmm2
.L24:
/* carry propagation; result limbs end up in xmm7, xmm12, xmm3, xmm6, xmm2 */
vpshufd $68, %xmm15, %xmm15
vpsrlq $26, %xmm11, %xmm12
vpsrlq $26, %xmm0, %xmm3
vpand %xmm15, %xmm11, %xmm11
vpand %xmm15, %xmm0, %xmm6
vpaddq %xmm12, %xmm9, %xmm9
vpaddq %xmm3, %xmm2, %xmm2
vpsrlq $26, %xmm9, %xmm3
vpsrlq $26, %xmm2, %xmm7
vpand %xmm15, %xmm9, %xmm9
vpand %xmm15, %xmm2, %xmm2
vpaddq %xmm3, %xmm1, %xmm3
vpmuludq 184(%rsp), %xmm7, %xmm7
vpaddq %xmm7, %xmm11, %xmm7
vpsrlq $26, %xmm3, %xmm4
vpsrlq $26, %xmm7, %xmm1
vpand %xmm15, %xmm3, %xmm3
vpand %xmm15, %xmm7, %xmm7
vpaddq %xmm4, %xmm6, %xmm6
vpaddq %xmm1, %xmm9, %xmm12
vpsrlq $26, %xmm6, %xmm0
vpand %xmm15, %xmm6, %xmm6
vpaddq %xmm0, %xmm2, %xmm2
.L23:
testq %rsi, %rsi
je .L25
/* input pointer non-zero: pack the low 32 bits of every lane and store the accumulator (40 bytes) at state+0 */
vpshufd $8, %xmm7, %xmm7
vpshufd $8, %xmm12, %xmm12
vpshufd $8, %xmm3, %xmm3
vpshufd $8, %xmm6, %xmm6
vpshufd $8, %xmm2, %xmm2
vpunpcklqdq %xmm12, %xmm7, %xmm7
vpunpcklqdq %xmm6, %xmm3, %xmm3
vmovdqu %xmm7, (%rdi)
vmovdqu %xmm3, 16(%rdi)
vmovq %xmm2, 32(%rdi)
jmp .L11
.L25:
/*
 * Input pointer is zero (final call): add the high lane of every limb into
 * the low lane, carry, and convert the five 26-bit limbs into three limbs
 * of 44, 44 and 42 bits (%r11, %r9, %r8).
 */
vpsrldq $8, %xmm7, %xmm0
vpaddq %xmm0, %xmm7, %xmm7
vpsrldq $8, %xmm12, %xmm0
vpaddq %xmm0, %xmm12, %xmm12
vpsrldq $8, %xmm3, %xmm0
vpaddq %xmm0, %xmm3, %xmm3
vpsrldq $8, %xmm6, %xmm0
vpaddq %xmm0, %xmm6, %xmm6
vpsrldq $8, %xmm2, %xmm0
vpaddq %xmm0, %xmm2, %xmm2
vmovd %xmm7, %eax
vmovd %xmm12, %edx
movl %eax, %r9d
shrl $26, %r9d
addl %edx, %r9d
movl %r9d, %r8d
andl $67108863, %r8d
vmovd %xmm3, %edx
shrl $26, %r9d
addl %edx, %r9d
vmovd %xmm6, %edx
movl %r9d, %ecx
shrl $26, %ecx
addl %edx, %ecx
movl %ecx, %esi
andl $67108863, %esi
vmovd %xmm2, %r10d
movl %r8d, %r11d
salq $26, %r11
andl $67108863, %eax
orq %rax, %r11
movabsq $17592186044415, %rax
andq %rax, %r11
andl $67108863, %r9d
salq $8, %r9
shrl $18, %r8d
movl %r8d, %r8d
orq %r8, %r9
movq %rsi, %rdx
salq $34, %rdx
orq %rdx, %r9
andq %rax, %r9
shrl $26, %ecx
addl %r10d, %ecx
salq $16, %rcx
shrl $10, %esi
movl %esi, %esi
orq %rsi, %rcx
/* fold the bits above the 42-bit top limb back into the bottom limb (times 5), twice */
movabsq $4398046511103, %r10
movq %rcx, %r8
andq %r10, %r8
shrq $42, %rcx
leaq (%rcx,%rcx,4), %rdx
addq %r11, %rdx
movq %rdx, %rsi
andq %rax, %rsi
shrq $44, %rdx
addq %r9, %rdx
movq %rdx, %rcx
andq %rax, %rcx
shrq $44, %rdx
addq %r8, %rdx
andq %rdx, %r10
shrq $42, %rdx
leaq (%rsi,%rdx,4), %rsi
leaq (%rsi,%rdx), %r11
movq %r11, %rbx
andq %rax, %rbx
shrq $44, %r11
addq %rcx, %r11
/*
 * Compute value + 5 - 2^130 in (%r9, %r8, %rsi); its sign bit builds an
 * all-ones/all-zeros mask in %rdx that selects, without branching, either
 * that difference or the unmodified limbs (%rbx, %r11, %r10).
 */
leaq 5(%rbx), %r9
movq %r9, %r8
shrq $44, %r8
addq %r11, %r8
movabsq $-4398046511104, %rsi
addq %r10, %rsi
movq %r8, %rdx
shrq $44, %rdx
addq %rdx, %rsi
movq %rsi, %rdx
shrq $63, %rdx
subq $1, %rdx
movq %rdx, %rcx
notq %rcx
andq %rcx, %rbx
andq %rcx, %r11
andq %r10, %rcx
andq %rax, %r9
andq %rdx, %r9
orq %r9, %rbx
/* store the three selected limbs as qwords at state+0, +8, +16 */
movq %rbx, (%rdi)
andq %r8, %rax
andq %rdx, %rax
orq %rax, %r11
movq %r11, 8(%rdi)
andq %rsi, %rdx
orq %rcx, %rdx
movq %rdx, 16(%rdi)
.L11:
/* restore %rbx (saved just below the frame pointer) and tear down the aligned frame */
movq -8(%rbp), %rbx
leave
ret
FN_END poly1305_blocks_avx

/*
 * poly1305_finish_ext_avx
 *   %rdi = state, %rsi = leftover input, %rdx = leftover byte count,
 *   %rcx = 16-byte output buffer for the tag.
 *
 * Steps:
 *   1. If there are leftover bytes, copy them into a zeroed 32-byte stack
 *      buffer, append a 1 byte unless exactly 16 bytes are left, set flag 8
 *      (fewer than 16 bytes) or flag 4 (16 or more) in state+120, and run
 *      poly1305_blocks_avx on the buffer.
 *   2. If flag 1 is set (the accumulator holds data), set flag 32 when the
 *      leftover count was 1..16 and flag 16 otherwise, then call
 *      poly1305_blocks_avx with a zero input pointer for the final reduction.
 *   3. Pack the three limbs at state+0/+8/+16 into 128 bits, add the 128-bit
 *      value stored at state+104, wipe state bytes 0..127 and store the tag.
 */
GLOBAL_HIDDEN_FN poly1305_finish_ext_avx
poly1305_finish_ext_avx_local:
pushq %r12
pushq %rbp
pushq %rbx
subq $32, %rsp
movq %rdi, %rbx
movq %rdx, %rbp
movq %rcx, %r12
testq %rdx, %rdx
je .L30
/* zero the 32-byte scratch block */
movq $0, (%rsp)
movq $0, 8(%rsp)
movq $0, 16(%rsp)
movq $0, 24(%rsp)
/* %rax walks the destination; %rsi becomes (source - destination) so (%rax,%rsi) addresses the source */
movq %rsp, %rax
subq %rsp, %rsi
/* copy the leftover bytes in chunks of 16/8/4/2/1 according to the bits of the count */
testb $16, %dl
je .L31
vmovdqu (%rsp,%rsi), %xmm0
vmovdqa %xmm0, (%rsp)
addq $16, %rax
.L31:
testb $8, %bpl
je .L32
movq (%rax,%rsi), %rdx
movq %rdx, (%rax)
addq $8, %rax
.L32:
testb $4, %bpl
je .L33
movl (%rax,%rsi), %edx
movl %edx, (%rax)
addq $4, %rax
.L33:
testb $2, %bpl
je .L34
movzwl (%rax,%rsi), %edx
movw %dx, (%rax)
addq $2, %rax
.L34:
testb $1, %bpl
je .L35
movzbl (%rax,%rsi), %edx
movb %dl, (%rax)
.L35:
/* exactly 16 bytes: no padding byte, flag 4 (see .L36) */
cmpq $16, %rbp
je .L36
/* otherwise append a 1 byte; %rax = 8 when count < 16, 4 when count > 16 */
movb $1, (%rsp,%rbp)
movq 120(%rbx), %rdx
cmpq $16, %rbp
sbbq %rax, %rax
andl $4, %eax
addq $4, %rax
.L37:
/* record the flag and process the 32-byte scratch block */
orq %rdx, %rax
movq %rax, 120(%rbx)
movq %rsp, %rsi
movl $32, %edx
movq %rbx, %rdi
call poly1305_blocks_avx_local
.L30:
movq 120(%rbx), %rax
testb $1, %al
je .L38
/* (count - 1) <= 15 unsigned, i.e. count in 1..16: flag 32; count 0 or > 16: flag 16 */
subq $1, %rbp
cmpq $15, %rbp
jbe .L39
orq $16, %rax
movq %rax, 120(%rbx)
jmp .L40
.L39:
orq $32, %rax
movq %rax, 120(%rbx)
.L40:
/* final call: zero input pointer, byte count 32 */
movl $32, %edx
movl $0, %esi
movq %rbx, %rdi
call poly1305_blocks_avx_local
.L38:
/* %rax:%rdx = limbs at state+0/+8/+16 (44-bit spacing) packed into 128 bits */
movq 8(%rbx), %rax
movq %rax, %rdx
salq $44, %rdx
orq (%rbx), %rdx
shrq $20, %rax
movq 16(%rbx), %rcx
salq $24, %rcx
orq %rcx, %rax
/* add the 128-bit value saved at state+104 (key bytes 16..31), carry discarded */
movq 104(%rbx), %rcx
movq 112(%rbx), %rsi
addq %rcx, %rdx
adcq %rsi, %rax
/* wipe state bytes 0..127 before returning */
vpxor %xmm0, %xmm0, %xmm0
vmovdqu %xmm0, (%rbx)
vmovdqu %xmm0, 16(%rbx)
vmovdqu %xmm0, 32(%rbx)
vmovdqu %xmm0, 48(%rbx)
vmovdqu %xmm0, 64(%rbx)
vmovdqu %xmm0, 80(%rbx)
vmovdqu %xmm0, 96(%rbx)
vmovdqu %xmm0, 112(%rbx)
/* write the 16-byte tag */
movq %rdx, (%r12)
movq %rax, 8(%r12)
jmp .L43
.L36:
movq 120(%rbx), %rdx
movl $4, %eax
jmp .L37
.L43:
addq $32, %rsp
popq %rbx
popq %rbp
popq %r12
ret
FN_END poly1305_finish_ext_avx

/*
 * poly1305_auth_avx
 *   %rdi = 16-byte tag output, %rsi = message, %rdx = message length,
 *   %rcx = 32-byte key.
 *
 * One-shot authenticator. Messages shorter than 128 bytes are handed to
 * poly1305_auth_x86_local (a label not defined in this part of the file).
 * Otherwise a 128-byte, 64-byte-aligned state is built on the stack and the
 * init / blocks / finish routines above are called in sequence.
 */
GLOBAL_HIDDEN_FN poly1305_auth_avx
cmp $128, %rdx
jb poly1305_auth_x86_local
pushq %rbp
movq %rsp, %rbp
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
/* reserve the aligned state */
andq $-64, %rsp
addq $-128, %rsp
/* keep the arguments in callee-saved registers: %r14 = tag, %r12 = message, %rbx = length */
movq %rdi, %r14
movq %rsi, %r12
movq %rdx, %rbx
/* init(state, key, length): %rdx still holds the length and serves as the size hint */
movq %rsp, %rdi
movq %rcx, %rsi
call poly1305_init_ext_avx_local
/* process the largest multiple of 32 bytes */
movq %rbx, %r13
andq $-32, %r13
je .L46
movq %rsp, %rdi
movq %r13, %rdx
movq %r12, %rsi
call poly1305_blocks_avx_local
addq %r13, %r12
subq %r13, %rbx
.L46:
/* finish(state, remaining input, remaining length, tag) */
movq %rsp, %rdi
movq %r14, %rcx
movq %rbx, %rdx
movq %r12, %rsi
call poly1305_finish_ext_avx_local
/* drop the aligned state: the four saved registers sit 32 bytes below %rbp */
leaq -32(%rbp), %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %rbp
ret
FN_END poly1305_auth_avx

+ 1093
- 0
src/libcryptobox/poly1305/avx2.S
File diff suppressed because it is too large
View File


+ 21
- 0
src/libcryptobox/poly1305/constants.S View File

@@ -0,0 +1,21 @@
/*
 * Read-only constant table for the poly1305 assembly back ends.
 *
 * Every entry is two .long words (8 bytes, low word first). The number in the
 * leading comment is the entry's byte offset from poly1305_constants_x86.
 * NOTE(review): the bit patterns look like IEEE-754 doubles (for example
 * poly1305_x86_two32 = 0x41f00000:00000000, i.e. 2^32) - confirm against the
 * x86 code that consumes them; no consumer is visible in this part of the file.
 */
SECTION_RODATA

.p2align 4
poly1305_constants_x86:
/* 0 */ poly1305_x86_scale: .long 0x0,0x37f40000
/* 8 */ poly1305_x86_two32: .long 0x0,0x41f00000
/* 16 */ poly1305_x86_two64: .long 0x0,0x43f00000
/* 24 */ poly1305_x86_two96: .long 0x0,0x45f00000
/* 32 */ poly1305_x86_alpha32: .long 0x0,0x45e80000
/* 40 */ poly1305_x86_alpha64: .long 0x0,0x47e80000
/* 48 */ poly1305_x86_alpha96: .long 0x0,0x49e80000
/* 56 */ poly1305_x86_alpha130: .long 0x0,0x4c080000
/* 64 */ poly1305_x86_doffset0: .long 0x0,0x43300000
/* 72 */ poly1305_x86_doffset1: .long 0x0,0x45300000
/* 80 */ poly1305_x86_doffset2: .long 0x0,0x47300000
/* 88 */ poly1305_x86_doffset3: .long 0x0,0x49300000
/* 96 */ poly1305_x86_doffset3minustwo128: .long 0x0,0x492ffffe
/* 104 */ poly1305_x86_hoffset0: .long 0xfffffffb,0x43300001
/* 112 */ poly1305_x86_hoffset1: .long 0xfffffffe,0x45300001
/* 120 */ poly1305_x86_hoffset2: .long 0xfffffffe,0x47300001
/* 128 */ poly1305_x86_hoffset3: .long 0xfffffffe,0x49300003

+ 0
- 202
src/libcryptobox/poly1305/poly1305-donna-16.h View File

@@ -1,202 +0,0 @@
/*
poly1305 implementation using 16 bit * 16 bit = 32 bit multiplication and 32 bit addition
*/

#if defined(_MSC_VER)
#define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
#define POLY1305_NOINLINE __attribute__((noinline))
#else
#define POLY1305_NOINLINE
#endif

#define poly1305_block_size 16

/*
 * Internal state overlaid on the caller's poly1305_context.
 * Size: 17 + sizeof(size_t) + 28*sizeof(unsigned short)
 * (16 buffer bytes + 1 flag byte, the leftover counter, and 10 + 10 + 8 shorts),
 * not counting any padding the compiler inserts.
 */
typedef struct poly1305_state_internal_t {
unsigned char buffer[poly1305_block_size]; /* partial block waiting for more input */
size_t leftover; /* number of valid bytes in buffer */
unsigned short r[10]; /* clamped key half r, ten 13 bit limbs */
unsigned short h[10]; /* accumulator, ten 13 bit limbs */
unsigned short pad[8]; /* key bytes 16..31 as eight little endian 16 bit words */
unsigned char final; /* non-zero while the padded last block is processed */
} poly1305_state_internal_t;

/* read a little endian 16 bit unsigned integer from the two bytes at p */
static unsigned short
U8TO16(const unsigned char *p) {
	const unsigned short lo = (unsigned short)(p[0] & 0xff);
	const unsigned short hi = (unsigned short)(p[1] & 0xff);

	return (unsigned short)(lo | (hi << 8));
}

/* write v to p[0..1] as two bytes, least significant byte first */
static void
U16TO8(unsigned char *p, unsigned short v) {
	unsigned char *out = p;

	*out++ = (unsigned char)(v & 0xff);
	*out = (unsigned char)((v >> 8) & 0xff);
}

/*
 * Initialise ctx from a 32 byte one-time key.
 *
 * key[0..15] becomes the clamped multiplier r (ten 13 bit limbs),
 * key[16..31] is saved as the pad that poly1305_finish adds to the result,
 * and the accumulator, leftover count and final flag are reset.
 */
void
poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
	poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
	unsigned short w[8];
	size_t n;

	/* key[0..15] as eight little endian 16 bit words */
	for (n = 0; n < 8; n++)
		w[n] = U8TO16(&key[2 * n]);

	/* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, split into 13 bit limbs */
	st->r[0] = (w[0]) & 0x1fff;
	st->r[1] = ((w[0] >> 13) | (w[1] << 3)) & 0x1fff;
	st->r[2] = ((w[1] >> 10) | (w[2] << 6)) & 0x1f03;
	st->r[3] = ((w[2] >> 7) | (w[3] << 9)) & 0x1fff;
	st->r[4] = ((w[3] >> 4) | (w[4] << 12)) & 0x00ff;
	st->r[5] = (w[4] >> 1) & 0x1ffe;
	st->r[6] = ((w[4] >> 14) | (w[5] << 2)) & 0x1fff;
	st->r[7] = ((w[5] >> 11) | (w[6] << 5)) & 0x1f81;
	st->r[8] = ((w[6] >> 8) | (w[7] << 8)) & 0x1fff;
	st->r[9] = (w[7] >> 5) & 0x007f;

	/* the accumulator starts at zero */
	for (n = 0; n < 10; n++)
		st->h[n] = 0;

	/* keep key[16..31] for the final addition */
	for (n = 0; n < 8; n++)
		st->pad[n] = U8TO16(&key[16 + (2 * n)]);

	st->leftover = 0;
	st->final = 0;
}

/*
 * Absorb every complete 16 byte block of m into the accumulator st->h:
 * for each block, h = (h + block + hibit) * r, partially reduced.
 * Trailing bytes (bytes % 16) are ignored; the caller buffers them.
 * When st->final is set the 2^128 bit is not added.
 */
static void
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
const unsigned short hibit = (st->final) ? 0 : (1 << 11); /* 1 << 128 */
unsigned short t0,t1,t2,t3,t4,t5,t6,t7;
unsigned long d[10];
unsigned long c;

while (bytes >= poly1305_block_size) {
size_t i, j;

/* h += m[i] */
/* the block is read as eight 16 bit words and re-split into ten 13 bit limbs */
t0 = U8TO16(&m[ 0]); st->h[0] += ( t0 ) & 0x1fff;
t1 = U8TO16(&m[ 2]); st->h[1] += ((t0 >> 13) | (t1 << 3)) & 0x1fff;
t2 = U8TO16(&m[ 4]); st->h[2] += ((t1 >> 10) | (t2 << 6)) & 0x1fff;
t3 = U8TO16(&m[ 6]); st->h[3] += ((t2 >> 7) | (t3 << 9)) & 0x1fff;
t4 = U8TO16(&m[ 8]); st->h[4] += ((t3 >> 4) | (t4 << 12)) & 0x1fff;
st->h[5] += ((t4 >> 1) ) & 0x1fff;
t5 = U8TO16(&m[10]); st->h[6] += ((t4 >> 14) | (t5 << 2)) & 0x1fff;
t6 = U8TO16(&m[12]); st->h[7] += ((t5 >> 11) | (t6 << 5)) & 0x1fff;
t7 = U8TO16(&m[14]); st->h[8] += ((t6 >> 8) | (t7 << 8)) & 0x1fff;
st->h[9] += ((t7 >> 5) ) | hibit;

/* h *= r, (partial) h %= p */
/* schoolbook product; limbs that wrap past limb 9 are multiplied by 5 */
for (i = 0, c = 0; i < 10; i++) {
d[i] = c;
for (j = 0; j < 10; j++) {
d[i] += (unsigned long)st->h[j] * ((j <= i) ? st->r[i - j] : (5 * st->r[i + 10 - j]));
/* Sum(h[i] * r[i] * 5) will overflow slightly above 6 products with an unclamped r, so carry at 5 */
if (j == 4) {
c = (d[i] >> 13);
d[i] &= 0x1fff;
}
}
c += (d[i] >> 13);
d[i] &= 0x1fff;
}
/* the carry out of the top limb re-enters limb 0 multiplied by 5 */
c = ((c << 2) + c); /* c *= 5 */
c += d[0];
d[0] = ((unsigned short)c & 0x1fff);
c = (c >> 13);
d[1] += c;

for (i = 0; i < 10; i++)
st->h[i] = (unsigned short)d[i];

m += poly1305_block_size;
bytes -= poly1305_block_size;
}
}

/*
 * Finish the computation and write the 16 byte authenticator to mac.
 *
 * Any buffered partial block is padded with a 1 byte followed by zeros and
 * absorbed with st->final set. The accumulator is then fully carried,
 * reduced to its canonical value, truncated to 128 bits and added to the
 * saved pad. h, r and pad are zeroed before returning.
 *
 * NOTE(review): st->buffer, st->leftover and st->final are not cleared here,
 * so the last partial message block stays in ctx after the call.
 */
POLY1305_NOINLINE void
poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
unsigned short c;
unsigned short g[10];
unsigned short mask;
unsigned long f;
size_t i;

/* process the remaining block */
if (st->leftover) {
/* this inner i shadows the function-level i */
size_t i = st->leftover;
st->buffer[i++] = 1;
for (; i < poly1305_block_size; i++)
st->buffer[i] = 0;
st->final = 1;
poly1305_blocks(st, st->buffer, poly1305_block_size);
}

/* fully carry h */
c = st->h[1] >> 13;
st->h[1] &= 0x1fff;
for (i = 2; i < 10; i++) {
st->h[i] += c;
c = st->h[i] >> 13;
st->h[i] &= 0x1fff;
}
st->h[0] += (c * 5);
c = st->h[0] >> 13;
st->h[0] &= 0x1fff;
st->h[1] += c;
c = st->h[1] >> 13;
st->h[1] &= 0x1fff;
st->h[2] += c;

/* compute h + -p */
/* g = h + 5 - 2^130; bit 13 of limb 9 is bit 130 of the value */
g[0] = st->h[0] + 5;
c = g[0] >> 13;
g[0] &= 0x1fff;
for (i = 1; i < 10; i++) {
g[i] = st->h[i] + c;
c = g[i] >> 13;
g[i] &= 0x1fff;
}
g[9] -= (1 << 13);

/* select h if h < p, or h + -p if h >= p */
/* mask is 0 when the subtraction borrowed (top bit of g[9] set), all ones otherwise; no branch on secret data */
mask = (g[9] >> ((sizeof(unsigned short) * 8) - 1)) - 1;
for (i = 0; i < 10; i++)
g[i] &= mask;
mask = ~mask;
for (i = 0; i < 10; i++)
st->h[i] = (st->h[i] & mask) | g[i];

/* h = h % (2^128) */
/* repack the ten 13 bit limbs into eight 16 bit words */
st->h[0] = ((st->h[0] ) | (st->h[1] << 13) ) & 0xffff;
st->h[1] = ((st->h[1] >> 3) | (st->h[2] << 10) ) & 0xffff;
st->h[2] = ((st->h[2] >> 6) | (st->h[3] << 7) ) & 0xffff;
st->h[3] = ((st->h[3] >> 9) | (st->h[4] << 4) ) & 0xffff;
st->h[4] = ((st->h[4] >> 12) | (st->h[5] << 1) | (st->h[6] << 14)) & 0xffff;
st->h[5] = ((st->h[6] >> 2) | (st->h[7] << 11) ) & 0xffff;
st->h[6] = ((st->h[7] >> 5) | (st->h[8] << 8) ) & 0xffff;
st->h[7] = ((st->h[8] >> 8) | (st->h[9] << 5) ) & 0xffff;

/* mac = (h + pad) % (2^128) */
/* word-wise addition; the carry travels in the upper bits of f */
f = (unsigned long)st->h[0] + st->pad[0];
st->h[0] = (unsigned short)f;
for (i = 1; i < 8; i++) {
f = (unsigned long)st->h[i] + st->pad[i] + (f >> 16);
st->h[i] = (unsigned short)f;
}

for (i = 0; i < 8; i++)
U16TO8(mac + (i * 2), st->h[i]);

/* zero out the state */
for (i = 0; i < 10; i++)
st->h[i] = 0;
for (i = 0; i < 10; i++)
st->r[i] = 0;
for (i = 0; i < 8; i++)
st->pad[i] = 0;
}

+ 0
- 219
src/libcryptobox/poly1305/poly1305-donna-32.h View File

@@ -1,219 +0,0 @@
/*
poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition
*/

#if defined(_MSC_VER)
#define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
#define POLY1305_NOINLINE __attribute__((noinline))
#else
#define POLY1305_NOINLINE
#endif

#define poly1305_block_size 16

/* 17 + sizeof(size_t) + 14*sizeof(unsigned long) */
/* internal layout overlaid on the caller's opaque poly1305_context (32 bit variant) */
typedef struct poly1305_state_internal_t {
unsigned long r[5]; /* clamped multiplier from key[0..15], five 26 bit limbs */
unsigned long h[5]; /* running accumulator, five 26 bit limbs */
unsigned long pad[4]; /* key[16..31] as four 32 bit words, added to the tag at the end */
size_t leftover; /* number of bytes currently held in buffer */
unsigned char buffer[poly1305_block_size]; /* partial block awaiting more input */
unsigned char final; /* set while the padded last block is processed */
} poly1305_state_internal_t;

/* read four bytes at p as one little endian 32 bit value (p[0] is least significant) */
static unsigned long
U8TO32(const unsigned char *p) {
    unsigned long word = 0;
    int idx;

    /* start at the most significant byte and shift it up as lower bytes arrive */
    for (idx = 3; idx >= 0; idx--)
        word = (word << 8) | (unsigned long)(p[idx] & 0xff);

    return word;
}

/* write the low 32 bits of v to p[0..3], least significant byte first */
static void
U32TO8(unsigned char *p, unsigned long v) {
    unsigned int idx;

    for (idx = 0; idx < 4; idx++) {
        p[idx] = (unsigned char)(v & 0xff);
        v >>= 8;
    }
}

/*
 * Set up a MAC state from a 32 byte one-time key (32 bit limb variant).
 *
 * key[0..15] is clamped and split into the five 26 bit limbs of r,
 * key[16..31] is kept as the pad, and the accumulator and the buffering
 * bookkeeping are reset.
 */
void
poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
    poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
    unsigned int limb;

    /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
    st->r[0] = (U8TO32(key +  0)     ) & 0x3ffffff;
    st->r[1] = (U8TO32(key +  3) >> 2) & 0x3ffff03;
    st->r[2] = (U8TO32(key +  6) >> 4) & 0x3ffc0ff;
    st->r[3] = (U8TO32(key +  9) >> 6) & 0x3f03fff;
    st->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff;

    /* h = 0 */
    for (limb = 0; limb < 5; limb++)
        st->h[limb] = 0;

    /* keep the second key half for the final addition */
    for (limb = 0; limb < 4; limb++)
        st->pad[limb] = U8TO32(key + 16 + (4 * limb));

    st->leftover = 0;
    st->final = 0;
}

/*
 * Absorb whole 16 byte blocks into the accumulator (32 bit limb variant).
 *
 * st    - internal state; h is loaded into locals, updated, and stored back
 * m     - message bytes to absorb
 * bytes - number of bytes at m; only complete blocks are consumed
 *
 * Accumulator and key are five 26 bit limbs; products are formed in 64 bits.
 */
static void
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
/* limb 4 starts at bit 104, so bit 24 of that limb is bit 128 of the block */
const unsigned long hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */
unsigned long r0,r1,r2,r3,r4;
unsigned long s1,s2,s3,s4;
unsigned long h0,h1,h2,h3,h4;
unsigned long long d0,d1,d2,d3,d4;
unsigned long c;

r0 = st->r[0];
r1 = st->r[1];
r2 = st->r[2];
r3 = st->r[3];
r4 = st->r[4];

/* precomputed 5*r limbs, used for product terms that wrap past the top limb */
s1 = r1 * 5;
s2 = r2 * 5;
s3 = r3 * 5;
s4 = r4 * 5;

h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];
h3 = st->h[3];
h4 = st->h[4];

while (bytes >= poly1305_block_size) {
/* h += m[i] */
/* overlapping 32 bit reads at byte offsets 0,3,6,9,12 are shifted into 26 bit limbs */
h0 += (U8TO32(m+ 0) ) & 0x3ffffff;
h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
h4 += (U8TO32(m+12) >> 8) | hibit;

/* h *= r */
d0 = ((unsigned long long)h0 * r0) + ((unsigned long long)h1 * s4) + ((unsigned long long)h2 * s3) + ((unsigned long long)h3 * s2) + ((unsigned long long)h4 * s1);
d1 = ((unsigned long long)h0 * r1) + ((unsigned long long)h1 * r0) + ((unsigned long long)h2 * s4) + ((unsigned long long)h3 * s3) + ((unsigned long long)h4 * s2);
d2 = ((unsigned long long)h0 * r2) + ((unsigned long long)h1 * r1) + ((unsigned long long)h2 * r0) + ((unsigned long long)h3 * s4) + ((unsigned long long)h4 * s3);
d3 = ((unsigned long long)h0 * r3) + ((unsigned long long)h1 * r2) + ((unsigned long long)h2 * r1) + ((unsigned long long)h3 * r0) + ((unsigned long long)h4 * s4);
d4 = ((unsigned long long)h0 * r4) + ((unsigned long long)h1 * r3) + ((unsigned long long)h2 * r2) + ((unsigned long long)h3 * r1) + ((unsigned long long)h4 * r0);

/* (partial) h %= p */
/* ripple the carries upward; the carry out of d4 re-enters h0 multiplied by 5 */
c = (unsigned long)(d0 >> 26); h0 = (unsigned long)d0 & 0x3ffffff;
d1 += c; c = (unsigned long)(d1 >> 26); h1 = (unsigned long)d1 & 0x3ffffff;
d2 += c; c = (unsigned long)(d2 >> 26); h2 = (unsigned long)d2 & 0x3ffffff;
d3 += c; c = (unsigned long)(d3 >> 26); h3 = (unsigned long)d3 & 0x3ffffff;
d4 += c; c = (unsigned long)(d4 >> 26); h4 = (unsigned long)d4 & 0x3ffffff;
h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff;
h1 += c;

m += poly1305_block_size;
bytes -= poly1305_block_size;
}

/* write the working limbs back to the state */
st->h[0] = h0;
st->h[1] = h1;
st->h[2] = h2;
st->h[3] = h3;
st->h[4] = h4;
}

/*
 * Finalise the MAC (32 bit limb variant).
 *
 * ctx - context previously set up with poly1305_init
 * mac - receives the 16 byte authenticator
 *
 * Pads and absorbs any buffered partial block, fully reduces the accumulator,
 * adds the pad from the key, writes the tag in little endian order and clears
 * h, r and pad in the state.
 */
POLY1305_NOINLINE void
poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
unsigned long h0,h1,h2,h3,h4,c;
unsigned long g0,g1,g2,g3,g4;
unsigned long long f;
unsigned long mask;

/* process the remaining block */
if (st->leftover) {
size_t i = st->leftover;
/* append a 1 byte, zero fill; st->final makes poly1305_blocks omit the 2^128 bit */
st->buffer[i++] = 1;
for (; i < poly1305_block_size; i++)
st->buffer[i] = 0;
st->final = 1;
poly1305_blocks(st, st->buffer, poly1305_block_size);
}

/* fully carry h */
h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];
h3 = st->h[3];
h4 = st->h[4];

c = h1 >> 26; h1 = h1 & 0x3ffffff;
h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
h1 += c;

/* compute h + -p */
/* g = h + 5 - 2^130: limb 4 starts at bit 104, so 1 << 26 in that limb is 2^130 */
g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
g4 = h4 + c - (1 << 26);

/* select h if h < p, or h + -p if h >= p */
/* branch free: mask is all ones when the top bit of g4 is clear, zero otherwise */
mask = (g4 >> ((sizeof(unsigned long) * 8) - 1)) - 1;
g0 &= mask;
g1 &= mask;
g2 &= mask;
g3 &= mask;
g4 &= mask;
mask = ~mask;
h0 = (h0 & mask) | g0;
h1 = (h1 & mask) | g1;
h2 = (h2 & mask) | g2;
h3 = (h3 & mask) | g3;
h4 = (h4 & mask) | g4;

/* h = h % (2^128) */
/* repack the 26 bit limbs into four 32 bit words; the explicit mask keeps this
   correct where unsigned long is wider than 32 bits */
h0 = ((h0 ) | (h1 << 26)) & 0xffffffff;
h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;

/* mac = (h + pad) % (2^128) */
/* 64 bit additions; f >> 32 carries into the next word, the final carry is dropped */
f = (unsigned long long)h0 + st->pad[0] ; h0 = (unsigned long)f;
f = (unsigned long long)h1 + st->pad[1] + (f >> 32); h1 = (unsigned long)f;
f = (unsigned long long)h2 + st->pad[2] + (f >> 32); h2 = (unsigned long)f;
f = (unsigned long long)h3 + st->pad[3] + (f >> 32); h3 = (unsigned long)f;

U32TO8(mac + 0, h0);
U32TO8(mac + 4, h1);
U32TO8(mac + 8, h2);
U32TO8(mac + 12, h3);

/* zero out the state */
/* only h, r and pad are cleared here; buffer, leftover and final are left as they are */
st->h[0] = 0;
st->h[1] = 0;
st->h[2] = 0;
st->h[3] = 0;
st->h[4] = 0;
st->r[0] = 0;
st->r[1] = 0;
st->r[2] = 0;
st->r[3] = 0;
st->r[4] = 0;
st->pad[0] = 0;
st->pad[1] = 0;
st->pad[2] = 0;
st->pad[3] = 0;
}


+ 0
- 224
src/libcryptobox/poly1305/poly1305-donna-64.h View File

@@ -1,224 +0,0 @@
/*
poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition
*/

#if defined(_MSC_VER)
#include <intrin.h>

typedef struct uint128_t {
unsigned long long lo;
unsigned long long hi;
} uint128_t;

#define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi)
#define ADD(out, in) { unsigned long long t = out.lo; out.lo += in.lo; out.hi += (out.lo < t) + in.hi; }
#define ADDLO(out, in) { unsigned long long t = out.lo; out.lo += in; out.hi += (out.lo < t); }
#define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift)))
#define LO(in) (in.lo)

#define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
#else
typedef unsigned uint128_t __attribute__((mode(TI)));
#endif

#define MUL(out, x, y) out = ((uint128_t)x * y)
#define ADD(out, in) out += in
#define ADDLO(out, in) out += in
#define SHR(in, shift) (unsigned long long)(in >> (shift))
#define LO(in) (unsigned long long)(in)

#define POLY1305_NOINLINE __attribute__((noinline))
#endif

#define poly1305_block_size 16

/* 17 + sizeof(size_t) + 8*sizeof(unsigned long long) */
/* internal layout overlaid on the caller's opaque poly1305_context (64 bit variant) */
typedef struct poly1305_state_internal_t {
unsigned long long r[3]; /* clamped multiplier from key[0..15], limbs of 44, 44 and 42 bits */
unsigned long long h[3]; /* running accumulator, same limb layout as r */
unsigned long long pad[2]; /* key[16..31] as two 64 bit words, added to the tag at the end */
size_t leftover; /* number of bytes currently held in buffer */
unsigned char buffer[poly1305_block_size]; /* partial block awaiting more input */
unsigned char final; /* set while the padded last block is processed */
} poly1305_state_internal_t;

/* read eight bytes at p as one little endian 64 bit value (p[0] is least significant) */
static unsigned long long
U8TO64(const unsigned char *p) {
    unsigned long long word = 0;
    int idx;

    /* start at the most significant byte and shift it up as lower bytes arrive */
    for (idx = 7; idx >= 0; idx--)
        word = (word << 8) | (unsigned long long)(p[idx] & 0xff);

    return word;
}

/* write v to p[0..7], least significant byte first */
static void
U64TO8(unsigned char *p, unsigned long long v) {
    unsigned int idx;

    for (idx = 0; idx < 8; idx++) {
        p[idx] = (unsigned char)(v & 0xff);
        v >>= 8;
    }
}

/*
 * Set up a MAC state from a 32 byte one-time key (64 bit limb variant).
 *
 * key[0..15] is clamped and split into the 44/44/42 bit limbs of r,
 * key[16..31] is kept as the pad, and the accumulator and the buffering
 * bookkeeping are reset.
 */
void
poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
    poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
    const unsigned long long lo = U8TO64(key);
    const unsigned long long hi = U8TO64(key + 8);
    unsigned int limb;

    /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
    st->r[0] = lo & 0xffc0fffffff;
    st->r[1] = ((lo >> 44) | (hi << 20)) & 0xfffffc0ffff;
    st->r[2] = (hi >> 24) & 0x00ffffffc0f;

    /* h = 0 */
    for (limb = 0; limb < 3; limb++)
        st->h[limb] = 0;

    /* keep the second key half for the final addition */
    st->pad[0] = U8TO64(key + 16);
    st->pad[1] = U8TO64(key + 24);

    st->leftover = 0;
    st->final = 0;
}

/*
 * Absorb whole 16 byte blocks into the accumulator (64 bit limb variant).
 *
 * st    - internal state; h is loaded into locals, updated, and stored back
 * m     - message bytes to absorb
 * bytes - number of bytes at m; only complete blocks are consumed
 *
 * Accumulator and key use limbs of 44, 44 and 42 bits; products are formed
 * in 128 bits through the MUL/ADD/ADDLO/SHR/LO macros.
 */
static void
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
/* limb 2 starts at bit 88, so bit 40 of that limb is bit 128 of the block */
const unsigned long long hibit = (st->final) ? 0 : ((unsigned long long)1 << 40); /* 1 << 128 */
unsigned long long r0,r1,r2;
unsigned long long s1,s2;
unsigned long long h0,h1,h2;
unsigned long long c;
uint128_t d0,d1,d2,d;

r0 = st->r[0];
r1 = st->r[1];
r2 = st->r[2];

h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];

/* wrap-around multipliers: r limbs times 20 (5 << 2), used for product terms
   that overflow past the 42 bit top limb */
s1 = r1 * (5 << 2);
s2 = r2 * (5 << 2);

while (bytes >= poly1305_block_size) {
unsigned long long t0,t1;

/* h += m[i] */
t0 = U8TO64(&m[0]);
t1 = U8TO64(&m[8]);

h0 += (( t0 ) & 0xfffffffffff);
h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit;

/* h *= r */
MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);

/* (partial) h %= p */
/* ripple the carries upward; the carry out of d2 re-enters h0 multiplied by 5 */
c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff;
h1 += c;

m += poly1305_block_size;
bytes -= poly1305_block_size;
}

/* write the working limbs back to the state */
st->h[0] = h0;
st->h[1] = h1;
st->h[2] = h2;
}


/*
 * Finalise the MAC (64 bit limb variant).
 *
 * ctx - context previously set up with poly1305_init
 * mac - receives the 16 byte authenticator
 *
 * Pads and absorbs any buffered partial block, fully reduces the accumulator,
 * adds the pad from the key, writes the tag in little endian order and clears
 * h, r and pad in the state.
 */
POLY1305_NOINLINE void
poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
unsigned long long h0,h1,h2,c;
unsigned long long g0,g1,g2;
unsigned long long t0,t1;

/* process the remaining block */
if (st->leftover) {
size_t i = st->leftover;
/* append a 1 byte, zero fill; st->final makes poly1305_blocks omit the 2^128 bit */
st->buffer[i] = 1;
for (i = i + 1; i < poly1305_block_size; i++)
st->buffer[i] = 0;
st->final = 1;
poly1305_blocks(st, st->buffer, poly1305_block_size);
}

/* fully carry h */
h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];

/* two passes over the limbs; a carry out of the 42 bit top limb wraps to h0 times 5 */
c = (h1 >> 44); h1 &= 0xfffffffffff;
h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff;
h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
h1 += c;

/* compute h + -p */
/* g = h + 5 - 2^130: limb 2 starts at bit 88, so 1 << 42 in that limb is 2^130 */
g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
g2 = h2 + c - ((unsigned long long)1 << 42);

/* select h if h < p, or h + -p if h >= p */
/* branch free: c is all ones when the top bit of g2 is clear, zero otherwise */
c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1;
g0 &= c;
g1 &= c;
g2 &= c;
c = ~c;
h0 = (h0 & c) | g0;
h1 = (h1 & c) | g1;
h2 = (h2 & c) | g2;

/* h = (h + pad) */
t0 = st->pad[0];
t1 = st->pad[1];

/* the pad is re-sliced into 44/44/42 bit limbs so it can be added limb by limb */
h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff;
h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff;
h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff;

/* mac = h % (2^128) */
/* repack the limbs into two 64 bit words */
h0 = ((h0 ) | (h1 << 44));
h1 = ((h1 >> 20) | (h2 << 24));

U64TO8(&mac[0], h0);
U64TO8(&mac[8], h1);

/* zero out the state */
/* only h, r and pad are cleared here; buffer, leftover and final are left as they are */
st->h[0] = 0;
st->h[1] = 0;
st->h[2] = 0;
st->r[0] = 0;
st->r[1] = 0;
st->r[2] = 0;
st->pad[0] = 0;
st->pad[1] = 0;
}


+ 0
- 186
src/libcryptobox/poly1305/poly1305-donna-8.h View File

@@ -1,186 +0,0 @@
/*
poly1305 implementation using 8 bit * 8 bit = 16 bit multiplication and 32 bit addition

based on the public domain reference version in supercop by djb
*/

#if defined(_MSC_VER)
#define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
#define POLY1305_NOINLINE __attribute__((noinline))
#else
#define POLY1305_NOINLINE
#endif

#define poly1305_block_size 16

/* 17 + sizeof(size_t) + 51*sizeof(unsigned char) */
/* internal layout overlaid on the caller's opaque poly1305_context (8 bit variant) */
typedef struct poly1305_state_internal_t {
unsigned char buffer[poly1305_block_size]; /* partial block awaiting more input */
size_t leftover; /* number of bytes currently held in buffer */
unsigned char h[17]; /* running accumulator, one byte per limb, little endian */
unsigned char r[17]; /* clamped multiplier from key[0..15]; r[16] is always 0 */
unsigned char pad[17]; /* key[16..31]; pad[16] is always 0 */
unsigned char final; /* set while the padded last block is processed */
} poly1305_state_internal_t;

/*
 * Set up a MAC state from a 32 byte one-time key (8 bit limb variant).
 *
 * key[0..15] is clamped byte by byte into r, key[16..31] is kept as the pad,
 * and the accumulator and the buffering bookkeeping are reset.
 */
void
poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
    /* r &= 0xffffffc0ffffffc0ffffffc0fffffff, expressed as one mask per key byte */
    static const unsigned char clamp[16] = {
        0xff, 0xff, 0xff, 0x0f,
        0xfc, 0xff, 0xff, 0x0f,
        0xfc, 0xff, 0xff, 0x0f,
        0xfc, 0xff, 0xff, 0x0f
    };
    poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
    size_t idx;

    st->leftover = 0;

    /* h = 0 */
    for (idx = 0; idx < 17; idx++)
        st->h[idx] = 0;

    /* clamped multiplier, with a zero seventeenth limb */
    for (idx = 0; idx < 16; idx++)
        st->r[idx] = key[idx] & clamp[idx];
    st->r[16] = 0;

    /* keep the second key half for the final addition */
    for (idx = 0; idx < 16; idx++)
        st->pad[idx] = key[16 + idx];
    st->pad[16] = 0;

    st->final = 0;
}

/* h += c over 17 little endian byte limbs; the carry out of the last limb is discarded */
static void
poly1305_add(unsigned char h[17], const unsigned char c[17]) {
    unsigned short carry = 0;
    unsigned int pos = 0;

    while (pos < 17) {
        carry += (unsigned short)h[pos] + (unsigned short)c[pos];
        h[pos] = (unsigned char)(carry & 0xff);
        carry >>= 8;
        pos++;
    }
}

/*
 * Partially reduce the wide product hr modulo 2^130 - 5 into the byte limbs h.
 * Bits at and above 2^130 are folded back into the low limbs multiplied by 5.
 */
static void
poly1305_squeeze(unsigned char h[17], unsigned long hr[17]) {
    unsigned long carry = 0;
    unsigned int pos = 0;

    /* first pass: byte-normalise the sixteen low limbs */
    while (pos < 16) {
        carry += hr[pos];
        h[pos] = (unsigned char)(carry & 0xff);
        carry >>= 8;
        pos++;
    }

    /* the top limb keeps two bits (128 and 129); the rest is overflow */
    carry += hr[16];
    h[16] = (unsigned char)(carry & 0x03);
    carry >>= 2;
    carry *= 5;

    /* second pass: add the folded overflow back in from limb 0 upward */
    for (pos = 0; pos < 16; pos++) {
        carry += h[pos];
        h[pos] = (unsigned char)(carry & 0xff);
        carry >>= 8;
    }
    h[16] = (unsigned char)(h[16] + carry);
}

/*
 * Fully reduce h modulo p = 2^130 - 5 without branching on its value:
 * compute h + (-p) and keep it unless bit 7 of the top limb ends up set,
 * in which case the saved original is restored.
 */
static void
poly1305_freeze(unsigned char h[17]) {
    static const unsigned char minusp[17] = {
        0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
        0xfc
    };
    unsigned char saved[17];
    unsigned char restore;
    unsigned int pos;

    /* remember h, then compute h + -p in place */
    for (pos = 0; pos < 17; pos++)
        saved[pos] = h[pos];
    poly1305_add(h, minusp);

    /* restore is 0xff when the saved value must be kept, 0x00 otherwise */
    restore = -(h[16] >> 7);
    for (pos = 0; pos < 17; pos++)
        h[pos] = (unsigned char)((saved[pos] & restore) | (h[pos] & ~restore));
}

/*
 * Absorb whole 16 byte blocks into the accumulator (8 bit limb variant).
 *
 * st    - internal state; st->h is updated in place, st->r and st->final are read
 * m     - message bytes to absorb
 * bytes - number of bytes at m; only complete blocks are consumed
 */
static void
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
/* the seventeenth byte of each block is 1 (bit 128) unless st->final is set */
const unsigned char hibit = st->final ^ 1; /* 1 << 128 */

while (bytes >= poly1305_block_size) {
unsigned long hr[17], u;
unsigned char c[17];
unsigned int i, j;

/* h += m */
for (i = 0; i < 16; i++)
c[i] = m[i];
c[16] = hibit;
poly1305_add(st->h, c);

/* h *= r */
/* schoolbook product over 17 byte limbs, one output limb hr[i] per iteration */
for (i = 0; i < 17; i++) {
u = 0;
for (j = 0; j <= i ; j++) {
u += (unsigned short)st->h[j] * st->r[i - j];
}
/* terms whose limb index wraps past 16 are folded back scaled by 320 (5 << 6) */
for (j = i + 1; j < 17; j++) {
unsigned long v = (unsigned short)st->h[j] * st->r[i + 17 - j];
v = ((v << 8) + (v << 6)); /* v *= (5 << 6); */
u += v;
}
hr[i] = u;
}

/* (partial) h %= p */
poly1305_squeeze(st->h, hr);

m += poly1305_block_size;
bytes -= poly1305_block_size;
}
}

/*
 * Finalise the MAC (8 bit limb variant).
 *
 * Pads and absorbs any buffered partial block, fully reduces the accumulator,
 * adds the pad from the key, copies the low 16 bytes to mac and clears h, r
 * and pad in the state.
 */
POLY1305_NOINLINE void
poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
    poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
    size_t idx;

    /* a partial block gets a 1 byte, zero fill, and no 2^128 bit */
    if (st->leftover) {
        size_t fill = st->leftover;

        st->buffer[fill] = 1;
        for (fill = fill + 1; fill < poly1305_block_size; fill++)
            st->buffer[fill] = 0;
        st->final = 1;
        poly1305_blocks(st, st->buffer, poly1305_block_size);
    }

    /* fully reduce h */
    poly1305_freeze(st->h);

    /* h = (h + pad) % (1 << 128) */
    poly1305_add(st->h, st->pad);
    for (idx = 0; idx < 16; idx++)
        mac[idx] = st->h[idx];

    /* zero out the state */
    for (idx = 0; idx < 17; idx++) {
        st->h[idx] = 0;
        st->r[idx] = 0;
        st->pad[idx] = 0;
    }
}

+ 0
- 201
src/libcryptobox/poly1305/poly1305-donna.c View File

@@ -1,201 +0,0 @@
#include "poly1305-donna.h"

/*
 * Pick the limb-size specific implementation. An explicit POLY1305_*BIT
 * define wins; otherwise the 64 bit version is used when the compiler offers
 * a 128 bit integer (or MSVC x64 intrinsics), and the 32 bit version if not.
 */
#if defined(POLY1305_8BIT)
#include "poly1305-donna-8.h"
#elif defined(POLY1305_16BIT)
#include "poly1305-donna-16.h"
#elif defined(POLY1305_32BIT)
#include "poly1305-donna-32.h"
#elif defined(POLY1305_64BIT)
#include "poly1305-donna-64.h"
#else

/*
 * auto detect between 32bit / 64bit
 *
 * Each HAS_* macro is defined to a plain 0 or 1. Letting a macro expand to
 * `defined(...)` inside an #if expression is undefined behaviour in ISO C,
 * so the tests are evaluated here, once, with `defined` written literally.
 */
#if defined(__SIZEOF_INT128__) && defined(__LP64__)
#define HAS_SIZEOF_INT128_64BIT 1
#else
#define HAS_SIZEOF_INT128_64BIT 0
#endif

#if defined(_MSC_VER) && defined(_M_X64)
#define HAS_MSVC_64BIT 1
#else
#define HAS_MSVC_64BIT 0
#endif

#if defined(__GNUC__) && defined(__LP64__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)))
#define HAS_GCC_4_4_64BIT 1
#else
#define HAS_GCC_4_4_64BIT 0
#endif

#if (HAS_SIZEOF_INT128_64BIT || HAS_MSVC_64BIT || HAS_GCC_4_4_64BIT)
#include "poly1305-donna-64.h"
#else
#include "poly1305-donna-32.h"
#endif

#endif

/*
 * Feed message bytes into the MAC. Input may arrive in pieces of any size:
 * a partially filled block is completed first, whole blocks are then hashed
 * straight from the caller's memory, and any tail is buffered for later.
 */
void
poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes) {
    poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
    size_t k;

    /* top up a block that an earlier call left incomplete */
    if (st->leftover != 0) {
        const size_t room = poly1305_block_size - st->leftover;
        const size_t take = (bytes < room) ? bytes : room;

        for (k = 0; k < take; k++)
            st->buffer[st->leftover + k] = m[k];
        m += take;
        bytes -= take;
        st->leftover += take;

        /* still short of a full block: wait for more input */
        if (st->leftover < poly1305_block_size)
            return;

        poly1305_blocks(st, st->buffer, poly1305_block_size);
        st->leftover = 0;
    }

    /* hash every complete block directly from the input */
    if (bytes >= poly1305_block_size) {
        const size_t whole = (bytes & ~(poly1305_block_size - 1));

        poly1305_blocks(st, m, whole);
        m += whole;
        bytes -= whole;
    }

    /* stash whatever remains (fewer than one block) */
    for (k = 0; k < bytes; k++)
        st->buffer[st->leftover + k] = m[k];
    st->leftover += bytes;
}

/*
 * One-shot MAC: authenticate `bytes` bytes at m under the 32 byte key and
 * write the 16 byte tag to mac. Runs init, update and finish on a context
 * local to this call.
 */
void
poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]) {
poly1305_context ctx;
poly1305_init(&ctx, key);
poly1305_update(&ctx, m, bytes);
poly1305_finish(&ctx, mac);
}

/*
 * Compare two 16 byte tags. Returns 1 when they are equal and 0 otherwise.
 * Every byte pair is always examined; there is no early exit on mismatch.
 */
int
poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]) {
    unsigned int acc = 0;
    size_t pos = 16;

    /* OR together the XOR of each byte pair; acc stays 0 only if all match */
    while (pos-- > 0)
        acc |= (mac1[pos] ^ mac2[pos]);

    /* acc - 1 has its top bit set only when acc was 0 */
    acc = (acc - 1) >> ((sizeof(unsigned int) * 8) - 1);
    return (acc & 1);
}


/* test a few basic operations */
/*
 * Runs four known-answer checks and returns 1 only if all of them pass:
 *   1. one-shot MAC of the NaCl example message
 *   2. the same message fed through poly1305_update in uneven pieces
 *   3. a key/message pair whose result wraps around the modulus
 *   4. a MAC over the MACs of messages of length 0..255
 */
int
poly1305_power_on_self_test(void) {
/* example from nacl */
static const unsigned char nacl_key[32] = {
0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91,
0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25,
0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65,
0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80,
};

static const unsigned char nacl_msg[131] = {
0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73,
0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce,
0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4,
0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a,
0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b,
0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72,
0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2,
0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38,
0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a,
0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae,
0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea,
0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda,
0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde,
0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3,
0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6,
0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74,
0xe3,0x55,0xa5
};

static const unsigned char nacl_mac[16] = {
0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5,
0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9
};

/* generates a final value of (2^130 - 2) == 3 */
static const unsigned char wrap_key[32] = {
0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
};

static const unsigned char wrap_msg[16] = {
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
};

static const unsigned char wrap_mac[16] = {
0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
};

/*
mac of the macs of messages of length 0 to 256, where the key and messages
have all their values set to the length
*/
/* note: only 28 initialisers are listed for this 32 byte array, so the last
   four bytes are implicitly zero */
static const unsigned char total_key[32] = {
0x01,0x02,0x03,0x04,0x05,0x06,0x07,
0xff,0xfe,0xfd,0xfc,0xfb,0xfa,0xf9,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff
};

static const unsigned char total_mac[16] = {
0x64,0xaf,0xe2,0xe8,0xd6,0xad,0x7b,0xbd,
0xd2,0x87,0xf9,0x7c,0x44,0x62,0x3d,0x39
};

poly1305_context ctx;
poly1305_context total_ctx;
unsigned char all_key[32];
unsigned char all_msg[256];
unsigned char mac[16];
size_t i, j;
/* result is ANDed with each check, so a single failure forces it to 0 */
int result = 1;

/* check 1: one-shot interface against the NaCl vector */
for (i = 0; i < sizeof(mac); i++)
mac[i] = 0;
poly1305_auth(mac, nacl_msg, sizeof(nacl_msg), nacl_key);
result &= poly1305_verify(nacl_mac, mac);

/* check 2: same vector through the incremental interface, in pieces of
   32, 64, 16, 8, 4, 2 and then single bytes */
for (i = 0; i < sizeof(mac); i++)
mac[i] = 0;
poly1305_init(&ctx, nacl_key);
poly1305_update(&ctx, nacl_msg + 0, 32);
poly1305_update(&ctx, nacl_msg + 32, 64);
poly1305_update(&ctx, nacl_msg + 96, 16);
poly1305_update(&ctx, nacl_msg + 112, 8);
poly1305_update(&ctx, nacl_msg + 120, 4);
poly1305_update(&ctx, nacl_msg + 124, 2);
poly1305_update(&ctx, nacl_msg + 126, 1);
poly1305_update(&ctx, nacl_msg + 127, 1);
poly1305_update(&ctx, nacl_msg + 128, 1);
poly1305_update(&ctx, nacl_msg + 129, 1);
poly1305_update(&ctx, nacl_msg + 130, 1);
poly1305_finish(&ctx, mac);
result &= poly1305_verify(nacl_mac, mac);

/* check 3: result that wraps around the modulus */
for (i = 0; i < sizeof(mac); i++)
mac[i] = 0;
poly1305_auth(mac, wrap_msg, sizeof(wrap_msg), wrap_key);
result &= poly1305_verify(wrap_mac, mac);

/* check 4: for each length i in 0..255, MAC an i byte message and feed the
   resulting tag into a running outer MAC */
poly1305_init(&total_ctx, total_key);
for (i = 0; i < 256; i++) {
/* set key and message to 'i,i,i..' */
for (j = 0; j < sizeof(all_key); j++)
all_key[j] = i;
for (j = 0; j < i; j++)
all_msg[j] = i;
poly1305_auth(mac, all_msg, i, all_key);
poly1305_update(&total_ctx, mac, 16);
}
poly1305_finish(&total_ctx, mac);
result &= poly1305_verify(total_mac, mac);

return result;
}

+ 0
- 20
src/libcryptobox/poly1305/poly1305-donna.h View File

@@ -1,20 +0,0 @@
#ifndef POLY1305_DONNA_H
#define POLY1305_DONNA_H

#include <stddef.h>

/*
 * Opaque MAC context. The implementation casts this to its own
 * poly1305_state_internal_t; `aligner` is a size_t placed first in the struct.
 */
typedef struct poly1305_context {
size_t aligner;
unsigned char opaque[136];
} poly1305_context;

/* incremental interface: init once, update any number of times, finish once */
void poly1305_init(poly1305_context *ctx, const unsigned char key[32]);
void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes);
void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]);
/* one-shot interface: MAC `bytes` bytes at m under key into mac */
void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]);

/* returns 1 if the two tags are equal, 0 otherwise */
int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);
/* returns 1 if the built-in known-answer tests pass, 0 otherwise */
int poly1305_power_on_self_test(void);

#endif /* POLY1305_DONNA_H */


+ 222
- 0
src/libcryptobox/poly1305/poly1305.c View File

@@ -0,0 +1,222 @@
/*
* Copyright (c) 2015, Vsevolod Stakhov
* Copyright (c) 2015, Andrew Moon
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "config.h"
#include "cryptobox.h"
#include "poly1305.h"
#include "platform_config.h"

extern unsigned long cpu_config;

/*
 * Layout that the dispatch layer overlays on the caller's poly1305_state.
 * `opaque` belongs to the selected backend; the remaining fields implement
 * partial-block buffering in this file.
 */
typedef struct poly1305_state_internal_t
{
unsigned char opaque[192]; /* largest state required (AVX2) */
size_t leftover, block_size; /* bytes held in buffer; backend block size captured at init */
unsigned char buffer[64]; /* largest blocksize (AVX2) */
} poly1305_state_internal;

/* descriptor of one backend: the CPU feature bits it needs plus its entry points */
typedef struct poly1305_impl_t
{
/* feature bits tested against cpu_config in poly1305_load; 0 for the generic backend */
unsigned long cpu_flags;
/* short backend name, e.g. "avx2" or "generic" */
const char *desc;

/* number of bytes the backend consumes per block */
size_t (*block_size)(void);
/* initialise backend state from key; bytes_hint is passed through from poly1305_init_ext */
void (*init_ext)(void *state, const poly1305_key *key, size_t bytes_hint);
/* absorb inlen bytes; callers in this file pass whole blocks only */
void (*blocks)(void *state, const unsigned char *in, size_t inlen);
/* absorb the final `remaining` bytes at in and write the tag to mac */
void (*finish_ext)(void *state, const unsigned char *in, size_t remaining,
unsigned char *mac);
/* one-shot MAC of inlen bytes at in under key */
void (*auth)(unsigned char *mac, const unsigned char *in, size_t inlen,
const poly1305_key *key);
} poly1305_impl_t;

/* declare the five entry points of the backend whose symbols carry the suffix _<ext> */
#define POLY1305_DECLARE(ext) \
size_t poly1305_block_size_##ext(void); \
void poly1305_init_ext_##ext(void *state, const poly1305_key *key, size_t bytes_hint); \
void poly1305_blocks_##ext(void *state, const unsigned char *in, size_t inlen); \
void poly1305_finish_ext_##ext(void *state, const unsigned char *in, size_t remaining, unsigned char *mac); \
void poly1305_auth_##ext(unsigned char *mac, const unsigned char *m, size_t inlen, const poly1305_key *key);

/* build a poly1305_impl_t initialiser for backend <ext>; field order follows the struct */
#define POLY1305_IMPL(cpuflags, desc, ext) \
{(cpuflags), desc, poly1305_block_size_##ext, poly1305_init_ext_##ext, poly1305_blocks_##ext, poly1305_finish_ext_##ext, poly1305_auth_##ext}

/*
 * Each optimised backend is declared only when its HAVE_* macro is defined.
 * The POLY1305_<ISA> macro it defines doubles as the "backend available"
 * flag tested when the implementation table is built.
 */
#if defined(HAVE_AVX2)
POLY1305_DECLARE(avx2)
#define POLY1305_AVX2 POLY1305_IMPL(CPUID_AVX2, "avx2", avx2)
#endif
#if defined(HAVE_AVX)
POLY1305_DECLARE(avx)
#define POLY1305_AVX POLY1305_IMPL(CPUID_AVX, "avx", avx)
#endif
#if defined(HAVE_SSE2)
POLY1305_DECLARE(sse2)
#define POLY1305_SSE2 POLY1305_IMPL(CPUID_SSE2, "sse2", sse2)
#endif

/* the portable backend is always available and needs no CPU feature bits */
POLY1305_DECLARE(ref)
#define POLY1305_GENERIC POLY1305_IMPL(0, "generic", ref)

/*
 * Table of available implementations.
 *
 * The generic implementation comes first so that index 0 can serve as the
 * default. The optimized ones follow from most to least optimized;
 * poly1305_load() takes the first entry whose cpu_flags intersect cpu_config,
 * and the generic entry (cpu_flags == 0) can never match that test.
 */
static const poly1305_impl_t poly1305_list[] =
{
	POLY1305_GENERIC,

#if defined(POLY1305_AVX2)
	POLY1305_AVX2,
#endif
#if defined(POLY1305_AVX)
	POLY1305_AVX,
#endif
#if defined(POLY1305_SSE2)
	POLY1305_SSE2,
#endif
};

/* implementation currently in use; the generic one until poly1305_load() finds better */
static const poly1305_impl_t *poly1305_opt = &poly1305_list[0];

/* returns 1 when p falls on a sizeof(size_t) boundary, 0 otherwise */
static int poly1305_is_aligned(const void *p)
{
	const size_t word_mask = sizeof(size_t) - 1;
	const size_t addr = (size_t) p;

	return (addr & word_mask) == 0;
}

void poly1305_load(void)
{
guint i;

if (cpu_config != 0) {
for (i = 0; i < G_N_ELEMENTS(poly1305_list); i++) {
if (poly1305_list[i].cpu_flags & cpu_config) {
poly1305_opt = &poly1305_list[i];
break;
}
}
}
}

/*
 * processes inlen bytes (full blocks only), handling input alignment
 *
 * state - dispatch-level state; state->opaque is handed to the backend
 * in    - input bytes, any alignment
 * inlen - number of bytes at in; the caller passes a multiple of the block size
 *
 * Word-aligned input goes straight to the backend. Unaligned input is copied
 * through a word-aligned staging area in chunks of at most 1024 bytes.
 */
static void poly1305_consume(poly1305_state_internal *state,
		const unsigned char *in, size_t inlen)
{
	/*
	 * A bare unsigned char array carries no alignment guarantee, so the
	 * staging bytes share a union with a size_t: that gives them the word
	 * alignment poly1305_is_aligned() tests for.
	 */
	union {
		size_t align;
		unsigned char bytes[1024];
	} staging;

	/* it's ok to call with 0 bytes */
	if (!inlen)
		return;

	/* if everything is aligned, handle directly */
	if (poly1305_is_aligned (in)) {
		poly1305_opt->blocks (state->opaque, in, inlen);
		return;
	}

	/* copy the unaligned data to an aligned buffer and process in chunks */
	while (inlen) {
		const size_t bytes = (inlen > sizeof(staging.bytes)) ?
				sizeof(staging.bytes) : inlen;

		memcpy (staging.bytes, in, bytes);
		poly1305_opt->blocks (state->opaque, staging.bytes, bytes);
		in += bytes;
		inlen -= bytes;
	}
}

/*
 * Initialise S for a new message under key.
 * Same as poly1305_init_ext with a bytes_hint of 0.
 */
void poly1305_init(poly1305_state *S, const poly1305_key *key)
{
	poly1305_init_ext (S, key, 0);
}

/*
 * Initialise S for a new message under key.
 *
 * S          - caller-provided state, overlaid with poly1305_state_internal
 * key        - 32 byte one-time key
 * bytes_hint - forwarded unchanged to the backend's init_ext
 *
 * Also clears the partial-block count and records the block size of the
 * backend selected at the time of the call.
 */
void poly1305_init_ext(poly1305_state *S, const poly1305_key *key,
size_t bytes_hint)
{
poly1305_state_internal *state = (poly1305_state_internal *) S;
poly1305_opt->init_ext (state->opaque, key, bytes_hint);
state->leftover = 0;
state->block_size = poly1305_opt->block_size ();
}

/*
 * Feed inlen bytes at in into the MAC. Input may arrive in pieces of any
 * size: a partially filled block is completed first, whole blocks are then
 * passed on through poly1305_consume, and any tail is buffered for later.
 */
void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen)
{
	poly1305_state_internal *state = (poly1305_state_internal *) S;
	size_t chunk;

	/* top up a block that an earlier call left incomplete */
	if (state->leftover != 0) {
		chunk = state->block_size - state->leftover;
		if (inlen < chunk) {
			chunk = inlen;
		}

		memcpy (state->buffer + state->leftover, in, chunk);
		in += chunk;
		inlen -= chunk;
		state->leftover += chunk;

		/* still short of a full block: wait for more input */
		if (state->leftover < state->block_size) {
			return;
		}

		poly1305_opt->blocks (state->opaque, state->buffer, state->block_size);
		state->leftover = 0;
	}

	/* hand every complete block to the backend */
	if (inlen >= state->block_size) {
		chunk = (inlen & ~(state->block_size - 1));
		poly1305_consume (state, in, chunk);
		in += chunk;
		inlen -= chunk;
	}

	/* stash whatever remains (fewer than one block) */
	if (inlen != 0) {
		memcpy (state->buffer + state->leftover, in, inlen);
		state->leftover += inlen;
	}
}

/*
 * Produce the tag for everything fed in so far and write it to mac.
 * The buffered partial block (state->leftover bytes of state->buffer) is
 * passed to the backend's finish_ext together with its state.
 * NOTE(review): leftover and buffer are not reset here; any clearing of the
 * state is presumably done inside finish_ext - confirm.
 */
void poly1305_finish(poly1305_state *S, unsigned char *mac)
{
poly1305_state_internal *state = (poly1305_state_internal *) S;
poly1305_opt->finish_ext (state->opaque, state->buffer, state->leftover,
mac);
}

/*
 * One-shot MAC: authenticate inlen bytes at in under key and write the tag
 * to mac, using the auth entry point of the currently selected backend.
 */
void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen,
const poly1305_key *key)
{
poly1305_opt->auth (mac, in, inlen, key);
}

/*
 * Compare two 16 byte tags. Returns 1 when they are equal and 0 otherwise.
 * Every byte pair is always examined; there is no early exit on mismatch.
 */
int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16])
{
	unsigned int acc = 0;
	size_t pos = 16;

	/* OR together the XOR of each byte pair; acc stays 0 only if all match */
	while (pos-- > 0) {
		acc |= (mac1[pos] ^ mac2[pos]);
	}

	/* acc - 1 has its top bit set only when acc was 0 */
	acc = (acc - 1) >> ((sizeof(unsigned int) * 8) - 1);

	return (acc & 1);
}

+ 38
- 0
src/libcryptobox/poly1305/poly1305.h View File

@@ -0,0 +1,38 @@
#ifndef POLY1305_H
#define POLY1305_H

#include <stddef.h>

#if defined(__cplusplus)
extern "C"
{
#endif

/*
 * Caller-allocated MAC state. poly1305.c casts this to its internal layout
 * (192 backend bytes, two size_t fields and a 64 byte buffer).
 * NOTE(review): the struct holds only unsigned char, so nothing here forces
 * size_t alignment for that cast - confirm that callers allocate it suitably.
 */
typedef struct poly1305_state
{
unsigned char opaque[320];
} poly1305_state;

/* 32 byte one-time key */
typedef struct poly1305_key
{
unsigned char b[32];
} poly1305_key;

/* incremental interface: init (or init_ext) once, update any number of times, finish once */
void poly1305_init(poly1305_state *S, const poly1305_key *key);
/* as poly1305_init, with a bytes_hint that is forwarded to the backend */
void poly1305_init_ext(poly1305_state *S, const poly1305_key *key,
size_t bytes_hint);
void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen);
void poly1305_finish(poly1305_state *S, unsigned char *mac);

/* one-shot interface: MAC inlen bytes at in under key into mac */
void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen,
const poly1305_key *key);
/* returns 1 if the two 16 byte tags are equal, 0 otherwise */
int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);

/* select the backend matching the detected CPU features; the generic one is used until then */
void poly1305_load(void);

#if defined(__cplusplus)
}
#endif

#endif /* POLY1305_H */


+ 237
- 0
src/libcryptobox/poly1305/ref-32.c View File

@@ -0,0 +1,237 @@
/*
poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition

assumes the existence of uint32_t and uint64_t
*/

#include "config.h"

/* Size in bytes of one Poly1305 message block. */
enum {
POLY1305_BLOCK_SIZE = 16
};

/* Working state of the 32-bit reference implementation. */
typedef struct poly1305_state_ref_t {
/* clamped multiplier r, split into five 26-bit limbs */
uint32_t r[5];
/* accumulator h, five 26-bit limbs (kept partially reduced between blocks) */
uint32_t h[5];
/* key bytes 16..31 as four little-endian words, added to h when finishing */
uint32_t pad[4];
/* set to 1 while the padded last block is absorbed; disables the 2^128 bit */
unsigned char final;
} poly1305_state_ref_t;

/* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */
/* decode the four bytes at p as one little-endian 32 bit unsigned integer */
static uint32_t
U8TO32(const unsigned char *p) {
	uint32_t word = 0;
	int idx;

	/* start at the most significant byte (p[3]) and shift the rest in below it */
	for (idx = 3; idx >= 0; idx--)
		word = (word << 8) | (uint32_t)(p[idx] & 0xff);

	return word;
}

/* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */
/* encode v into the four bytes at p, least significant byte first */
static void
U32TO8(unsigned char *p, uint32_t v) {
	unsigned int idx;

	for (idx = 0; idx < 4; idx++) {
		p[idx] = (unsigned char)(v & 0xff);
		v >>= 8;
	}
}

/* Reports the block size of this backend: POLY1305_BLOCK_SIZE (16) bytes. */
static size_t
poly1305_block_size_ref(void) {
return POLY1305_BLOCK_SIZE;
}

/*
 * Prepare a 32-bit reference state from a 32-byte key.
 *
 * state      - storage that is used as a poly1305_state_ref_t
 * key        - bytes 0..15 become the clamped multiplier r, bytes 16..31 the pad
 * bytes_hint - accepted for interface compatibility, ignored by this backend
 */
static void
poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) {
	poly1305_state_ref_t *ctx = (poly1305_state_ref_t *)state;
	const unsigned char *kb = key->b;
	unsigned int i;

	(void)bytes_hint;

	/*
	 * Split r into five 26-bit limbs; the masks also apply the clamp
	 * r &= 0xffffffc0ffffffc0ffffffc0fffffff.
	 */
	ctx->r[0] = U8TO32(kb + 0) & 0x3ffffff;
	ctx->r[1] = (U8TO32(kb + 3) >> 2) & 0x3ffff03;
	ctx->r[2] = (U8TO32(kb + 6) >> 4) & 0x3ffc0ff;
	ctx->r[3] = (U8TO32(kb + 9) >> 6) & 0x3f03fff;
	ctx->r[4] = (U8TO32(kb + 12) >> 8) & 0x00fffff;

	/* the accumulator starts at zero */
	for (i = 0; i < 5; i++)
		ctx->h[i] = 0;

	/* keep the second key half; it is added to the result when finishing */
	for (i = 0; i < 4; i++)
		ctx->pad[i] = U8TO32(kb + 16 + 4 * i);

	ctx->final = 0;
}

/*
 * Absorb whole 16-byte blocks: for every block the accumulator becomes
 * (h + block) * r, partially reduced modulo 2^130 - 5.
 *
 * state - poly1305_state_ref_t to update
 * in    - input bytes
 * inlen - number of input bytes; only complete 16-byte blocks are consumed,
 *         a shorter tail is left untouched
 */
static void
poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) {
poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
/* the 2^128 marker lands on bit 24 of the top limb; the padded final block carries its own marker byte instead */
const uint32_t hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */
uint32_t r0,r1,r2,r3,r4;
uint32_t s1,s2,s3,s4;
uint32_t h0,h1,h2,h3,h4;
uint64_t d0,d1,d2,d3,d4;
uint32_t c;

r0 = st->r[0];
r1 = st->r[1];
r2 = st->r[2];
r3 = st->r[3];
r4 = st->r[4];

/* 5 * r limbs: product terms that overflow 2^130 wrap around multiplied by 5 */
s1 = r1 * 5;
s2 = r2 * 5;
s3 = r3 * 5;
s4 = r4 * 5;

h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];
h3 = st->h[3];
h4 = st->h[4];

while (inlen >= POLY1305_BLOCK_SIZE) {
/* h += m[i] */
/* the block is cut into 26-bit limbs by overlapping 32-bit little-endian reads */
h0 += (U8TO32(in+ 0) ) & 0x3ffffff;
h1 += (U8TO32(in+ 3) >> 2) & 0x3ffffff;
h2 += (U8TO32(in+ 6) >> 4) & 0x3ffffff;
h3 += (U8TO32(in+ 9) >> 6) & 0x3ffffff;
h4 += (U8TO32(in+12) >> 8) | hibit;

/* h *= r */
d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + ((uint64_t)h4 * s1);
d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + ((uint64_t)h4 * s2);
d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + ((uint64_t)h4 * s3);
d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + ((uint64_t)h4 * s4);
d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + ((uint64_t)h4 * r0);

/* (partial) h %= p */
/* carries ripple upwards limb by limb; the carry out of the top limb re-enters h0 times 5 */
c = (uint32_t)(d0 >> 26); h0 = (uint32_t)d0 & 0x3ffffff;
d1 += c; c = (uint32_t)(d1 >> 26); h1 = (uint32_t)d1 & 0x3ffffff;
d2 += c; c = (uint32_t)(d2 >> 26); h2 = (uint32_t)d2 & 0x3ffffff;
d3 += c; c = (uint32_t)(d3 >> 26); h3 = (uint32_t)d3 & 0x3ffffff;
d4 += c; c = (uint32_t)(d4 >> 26); h4 = (uint32_t)d4 & 0x3ffffff;
h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff;
h1 += c;

in += POLY1305_BLOCK_SIZE;
inlen -= POLY1305_BLOCK_SIZE;
}

/* write the accumulator back; h1 may still hold one unpropagated carry */
st->h[0] = h0;
st->h[1] = h1;
st->h[2] = h2;
st->h[3] = h3;
st->h[4] = h4;
}

/*
 * Absorb the trailing partial block (if any), reduce the accumulator fully
 * modulo 2^130 - 5, add the pad and write the 16-byte tag.
 *
 * state     - poly1305_state_ref_t; its r, h and pad fields are zeroed on return
 * in        - the remaining input bytes
 * remaining - number of bytes at in; must be below POLY1305_BLOCK_SIZE because
 *             final[remaining] is written
 * mac       - receives the tag, little-endian
 */
static void
poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) {
poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
uint32_t h0,h1,h2,h3,h4,c;
uint32_t g0,g1,g2,g3,g4;
uint64_t f;
uint32_t mask;

/* process the remaining block */
if (remaining) {
unsigned char final[POLY1305_BLOCK_SIZE] = {0};
size_t i;
for (i = 0; i < remaining; i++)
final[i] = in[i];
/* a single 1 byte follows the data; st->final stops blocks_ref from adding the 2^128 bit on top */
final[remaining] = 1;
st->final = 1;
poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE);
}

/* fully carry h */
h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];
h3 = st->h[3];
h4 = st->h[4];

c = h1 >> 26; h1 = h1 & 0x3ffffff;
h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
h1 += c;

/* compute h + -p */
/* g = h + 5 - 2^130; the top bit of g4 ends up set exactly when h < p */
g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
g4 = h4 + c - (1 << 26);

/* select h if h < p, or h + -p if h >= p */
/* mask is all-ones when g is non-negative (h >= p) and zero otherwise; selection is done with masks, not branches */
mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1;
g0 &= mask;
g1 &= mask;
g2 &= mask;
g3 &= mask;
g4 &= mask;
mask = ~mask;
h0 = (h0 & mask) | g0;
h1 = (h1 & mask) | g1;
h2 = (h2 & mask) | g2;
h3 = (h3 & mask) | g3;
h4 = (h4 & mask) | g4;

/* h = h % (2^128) */
/* repack the five 26-bit limbs into four 32-bit words, dropping bits 128 and above */
h0 = ((h0 ) | (h1 << 26)) & 0xffffffff;
h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;

/* mac = (h + pad) % (2^128) */
/* f carries the overflow of each 32-bit addition into the next word */
f = (uint64_t)h0 + st->pad[0] ; h0 = (uint32_t)f;
f = (uint64_t)h1 + st->pad[1] + (f >> 32); h1 = (uint32_t)f;
f = (uint64_t)h2 + st->pad[2] + (f >> 32); h2 = (uint32_t)f;
f = (uint64_t)h3 + st->pad[3] + (f >> 32); h3 = (uint32_t)f;

U32TO8(mac + 0, h0);
U32TO8(mac + 4, h1);
U32TO8(mac + 8, h2);
U32TO8(mac + 12, h3);

/* zero out the state */
/* note: st->final is not reset here */
st->h[0] = 0;
st->h[1] = 0;
st->h[2] = 0;
st->h[3] = 0;
st->h[4] = 0;
st->r[0] = 0;
st->r[1] = 0;
st->r[2] = 0;
st->r[3] = 0;
st->r[4] = 0;
st->pad[0] = 0;
st->pad[1] = 0;
st->pad[2] = 0;
st->pad[3] = 0;
}

/*
 * One-shot MAC with the 32-bit reference backend.
 *
 * mac   - receives the 16-byte tag
 * in    - message bytes
 * inlen - message length in bytes
 * key   - 32-byte one-time key
 */
static void
poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) {
	poly1305_state_ref_t ctx;
	/* bytes after the last complete block, and the length of the complete blocks */
	const size_t tail = inlen & (size_t)(POLY1305_BLOCK_SIZE - 1);
	const size_t bulk = inlen - tail;

	poly1305_init_ext_ref(&ctx, key, inlen);

	if (bulk != 0)
		poly1305_blocks_ref(&ctx, in, bulk);

	poly1305_finish_ext_ref(&ctx, in + bulk, tail, mac);
}


+ 231
- 0
src/libcryptobox/poly1305/ref-64.c View File

@@ -0,0 +1,231 @@
/*
poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition

assumes the existence of uint64_t and uint128_t
*/

#include "config.h"
/* Size in bytes of one Poly1305 message block. */
enum {
POLY1305_BLOCK_SIZE = 16
};

/*
 * Pick a 128-bit unsigned type for the product accumulators.
 *
 * NOTE(review): on the _MSC_VER branch uint128_t is a two-field struct, but
 * poly1305_blocks_ref in this file casts integers to uint128_t and applies
 * *, + and >> to it, which is not valid for a struct - this branch looks
 * unable to compile as written; confirm. If neither _MSC_VER nor __GNUC__ is
 * defined, uint128_t is not declared at all.
 * POLY1305_NOINLINE is defined here but not used in this file.
 */
#if defined(_MSC_VER)
#include <intrin.h>

typedef struct uint128_t {
unsigned long long lo;
unsigned long long hi;
} uint128_t;

#define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
#else
typedef unsigned uint128_t __attribute__((mode(TI)));
#endif

#define POLY1305_NOINLINE __attribute__((noinline))
#endif

/* Working state of the 64-bit reference implementation. */
typedef struct poly1305_state_ref_t {
/* clamped multiplier r as limbs of 44, 44 and 42 bits */
uint64_t r[3];
/* accumulator h in the same 44/44/42-bit limb layout */
uint64_t h[3];
/* key bytes 16..31 as two little-endian 64-bit words, added to h when finishing */
uint64_t pad[2];
/* set to 1 while the padded last block is absorbed; disables the 2^128 bit */
unsigned char final;
} poly1305_state_ref_t;

/* interpret eight 8 bit unsigned integers as a 64 bit unsigned integer in little endian */
/* decode the eight bytes at p as one little-endian 64 bit unsigned integer */
static uint64_t
U8TO64(const unsigned char *p) {
	uint64_t word = 0;
	int idx;

	/* start at the most significant byte (p[7]) and shift the rest in below it */
	for (idx = 7; idx >= 0; idx--)
		word = (word << 8) | (uint64_t)p[idx];

	return word;
}

/* store a 64 bit unsigned integer as eight 8 bit unsigned integers in little endian */
/* encode v into the eight bytes at p, least significant byte first */
static void
U64TO8(unsigned char *p, uint64_t v) {
	unsigned int idx;

	for (idx = 0; idx < 8; idx++) {
		p[idx] = (unsigned char)(v & 0xff);
		v >>= 8;
	}
}

/* Reports the block size of this backend: POLY1305_BLOCK_SIZE (16) bytes. */
static size_t
poly1305_block_size_ref(void) {
return POLY1305_BLOCK_SIZE;
}

/*
 * Prepare a 64-bit reference state from a 32-byte key.
 *
 * state      - storage that is used as a poly1305_state_ref_t
 * key        - bytes 0..15 become the clamped multiplier r, bytes 16..31 the pad
 * bytes_hint - accepted for interface compatibility, ignored by this backend
 */
static void
poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) {
	poly1305_state_ref_t *ctx = (poly1305_state_ref_t *)state;
	const uint64_t lo = U8TO64(key->b + 0);
	const uint64_t hi = U8TO64(key->b + 8);
	unsigned int i;

	(void)bytes_hint;

	/*
	 * Split r into 44/44/42-bit limbs; the masks also apply the clamp
	 * r &= 0xffffffc0ffffffc0ffffffc0fffffff.
	 */
	ctx->r[0] = lo & 0xffc0fffffff;
	ctx->r[1] = ((lo >> 44) | (hi << 20)) & 0xfffffc0ffff;
	ctx->r[2] = (hi >> 24) & 0x00ffffffc0f;

	/* the accumulator starts at zero */
	for (i = 0; i < 3; i++)
		ctx->h[i] = 0;

	/* keep the second key half; it is added to the result when finishing */
	for (i = 0; i < 2; i++)
		ctx->pad[i] = U8TO64(key->b + 16 + 8 * i);

	ctx->final = 0;
}

/*
 * Absorb whole 16-byte blocks: for every block the accumulator becomes
 * (h + block) * r, partially reduced modulo 2^130 - 5.
 *
 * state - poly1305_state_ref_t to update
 * in    - input bytes
 * inlen - number of input bytes; only complete 16-byte blocks are consumed,
 *         a shorter tail is left untouched
 */
static void
poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) {
poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
/* the 2^128 marker lands on bit 40 of the top limb (44 + 44 + 40 = 128); omitted for the padded final block */
const uint64_t hibit = (st->final) ? 0 : ((uint64_t)1 << 40); /* 1 << 128 */
uint64_t r0,r1,r2;
uint64_t s1,s2;
uint64_t h0,h1,h2;
uint64_t c;
uint128_t d0,d1,d2;

r0 = st->r[0];
r1 = st->r[1];
r2 = st->r[2];

/* 20 * r limbs: the limbs span 132 bits, so wrapped terms pick up 5 * 2^2 */
s1 = r1 * (5 << 2);
s2 = r2 * (5 << 2);

h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];

while (inlen >= POLY1305_BLOCK_SIZE) {
uint64_t t0, t1;

/* h += in[i] */
t0 = U8TO64(in + 0);
t1 = U8TO64(in + 8);
h0 += (( t0 ) & 0xfffffffffff);
h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit;

/* h *= r */
d0 = ((uint128_t)h0 * r0) + ((uint128_t)h1 * s2) + ((uint128_t)h2 * s1);
d1 = ((uint128_t)h0 * r1) + ((uint128_t)h1 * r0) + ((uint128_t)h2 * s2);
d2 = ((uint128_t)h0 * r2) + ((uint128_t)h1 * r1) + ((uint128_t)h2 * r0);

/* (partial) h %= p */
/* carries ripple upwards; the carry out of the 42-bit top limb re-enters h0 times 5 */
c = (uint64_t)(d0 >> 44); h0 = (uint64_t)d0 & 0xfffffffffff;
d1 += c; c = (uint64_t)(d1 >> 44); h1 = (uint64_t)d1 & 0xfffffffffff;
d2 += c; c = (uint64_t)(d2 >> 42); h2 = (uint64_t)d2 & 0x3ffffffffff;
h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff;
h1 += c;

in += POLY1305_BLOCK_SIZE;
inlen -= POLY1305_BLOCK_SIZE;
}

/* write the accumulator back; h1 may still hold one unpropagated carry */
st->h[0] = h0;
st->h[1] = h1;
st->h[2] = h2;
}

/*
 * Absorb the trailing partial block (if any), reduce the accumulator fully
 * modulo 2^130 - 5, add the pad and write the 16-byte tag.
 *
 * state     - poly1305_state_ref_t; its r, h and pad fields are zeroed on return
 * in        - the remaining input bytes
 * remaining - number of bytes at in; must be below POLY1305_BLOCK_SIZE because
 *             final[remaining] is written
 * mac       - receives the tag, little-endian
 */
static void
poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) {
poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
uint64_t h0, h1, h2, c;
uint64_t g0, g1, g2;
uint64_t t0, t1;

/* process the remaining block */
if (remaining) {
unsigned char final[POLY1305_BLOCK_SIZE] = {0};
size_t i;
for (i = 0; i < remaining; i++)
final[i] = in[i];
/* a single 1 byte follows the data; st->final stops blocks_ref from adding the 2^128 bit on top */
final[remaining] = 1;
st->final = 1;
poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE);
}

/* fully carry h */
h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];

/* two passes around the limbs so that every limb ends up within its width */
c = (h1 >> 44); h1 &= 0xfffffffffff;
h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff;
h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
h1 += c;

/* compute h + -p */
/* g = h + 5 - 2^130; bit 63 of g2 ends up set exactly when h < p */
g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
g2 = h2 + c - ((uint64_t)1 << 42);

/* select h if h < p, or h + -p if h >= p */
/* c is all-ones when g is non-negative (h >= p) and zero otherwise; selection is done with masks, not branches */
c = (g2 >> 63) - 1;
h0 = (h0 & ~c) | (g0 & c);
h1 = (h1 & ~c) | (g1 & c);
h2 = (h2 & ~c) | (g2 & c);

/* h = (h + pad) */
t0 = st->pad[0];
t1 = st->pad[1];

/* the pad is split into the same 44/44/42-bit limbs before it is added */
h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff;
h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff;
h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff;

/* mac = h % (2^128) */
h0 = ((h0 ) | (h1 << 44));
h1 = ((h1 >> 20) | (h2 << 24));

U64TO8(&mac[0], h0);
U64TO8(&mac[8], h1);

/* zero out the state */
/* note: st->final is not reset here */
st->h[0] = 0;
st->h[1] = 0;
st->h[2] = 0;
st->r[0] = 0;
st->r[1] = 0;
st->r[2] = 0;
st->pad[0] = 0;
st->pad[1] = 0;
}


/*
 * One-shot MAC with the 64-bit reference backend.
 *
 * mac   - receives the 16-byte tag
 * in    - message bytes
 * inlen - message length in bytes
 * key   - 32-byte one-time key
 */
static void
poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) {
	poly1305_state_ref_t ctx;
	/* bytes after the last complete block, and the length of the complete blocks */
	const size_t tail = inlen & (size_t)(POLY1305_BLOCK_SIZE - 1);
	const size_t bulk = inlen - tail;

	poly1305_init_ext_ref(&ctx, key, inlen);

	if (bulk != 0)
		poly1305_blocks_ref(&ctx, in, bulk);

	poly1305_finish_ext_ref(&ctx, in + bulk, tail, mac);
}


+ 966
- 0
src/libcryptobox/poly1305/sse2.S View File

@@ -0,0 +1,966 @@
#include "../chacha20/macro.S"
#include "constants.S"
SECTION_TEXT

/* Returns 32 in %eax: the block size in bytes reported by the SSE2 backend. */
GLOBAL_HIDDEN_FN poly1305_block_size_sse2
movl $32, %eax
ret
FN_END poly1305_block_size_sse2

/*
 * poly1305_init_ext_sse2(state = %rdi, key = %rsi, bytes_hint = %rdx)
 *
 * Zeroes state bytes 0..47 (the accumulator area), clamps the first 16 key
 * bytes into r and stores r as five 26-bit limbs at state+40..59, copies key
 * bytes 16..31 (the pad) to state+104..119 and finally clears the flags qword
 * at state+120.
 * Depending on bytes_hint it also stores r^2 at state+60 (hint > 16) and r^4
 * at state+80 (hint >= 96), each as five 26-bit limbs, obtained by squaring
 * modulo 2^130 - 5. A hint of 0 is replaced by all-ones, so both powers are
 * computed.
 * Scratch values live below %rsp (red zone); the function makes no calls.
 */
GLOBAL_HIDDEN_FN poly1305_init_ext_sse2
poly1305_init_ext_sse2_local:
pushq %r15
xorps %xmm0, %xmm0
/* r11 (later r13) = bytes_hint, or -1 when the hint is 0 */
testq %rdx, %rdx
pushq %r14
movq %rdx, %r11
movq $-1, %rax
cmove %rax, %r11
pushq %r13
/* 17575274610687 = 0xffc0fffffff: clamp mask for the low 44-bit limb of r */
movabsq $17575274610687, %r9
pushq %r12
pushq %rbp
movq %r11, %r13
/* rbp = 2^44 - 1, the 44-bit limb mask; rbx counts the squarings done so far */
movabsq $17592186044415, %rbp
pushq %rbx
xorl %ebx, %ebx
movdqu %xmm0, 32(%rdi)
movdqu %xmm0, (%rdi)
movdqu %xmm0, 16(%rdi)
/* load the two key qwords and split them into clamped limbs: r9 = r0, rdx = r1, rcx = r2 */
movq 8(%rsi), %rcx
movq (%rsi), %rax
movq %rcx, %rdx
shrq $24, %rcx
andq %rax, %r9
salq $20, %rdx
shrq $44, %rax
movq %r9, %r8
orq %rax, %rdx
shrq $26, %r8
/* 17592181915647 = 0xfffffc0ffff and 68719475727 = 0x00ffffffc0f: clamp masks for r1 and r2 */
movabsq $17592181915647, %rax
andq %rax, %rdx
movabsq $68719475727, %rax
andq %rax, %rcx
/* re-split r into five 26-bit limbs (mask 67108863 = 2^26 - 1) and store them at state+40..56 */
movl %r9d, %eax
andl $67108863, %eax
movl %eax, 40(%rdi)
movl %edx, %eax
sall $18, %eax
orl %r8d, %eax
movq %rdx, %r8
andl $67108863, %eax
shrq $34, %r8
movl %eax, 44(%rdi)
movq %rdx, %rax
shrq $8, %rax
andl $67108863, %eax
movl %eax, 48(%rdi)
movl %ecx, %eax
sall $10, %eax
orl %r8d, %eax
/* r8 keeps the state pointer from here on; rdi is reused as the destination of each power */
movq %rdi, %r8
andl $67108863, %eax
movl %eax, 52(%rdi)
movq %rcx, %rax
shrq $16, %rax
movl %eax, 56(%rdi)
/* copy the pad (key bytes 16..31) to state+104 and state+112 */
movq 16(%rsi), %rax
movq %rax, 104(%rdi)
movq 24(%rsi), %rax
movq %rdx, %rsi
movq %rax, 112(%rdi)
/* loop head: the first pass needs hint > 16 and writes to state+60, the second needs hint >= 96 and writes to state+80 */
poly1305_init_ext_sse2_7:
testq %rbx, %rbx
jne poly1305_init_ext_sse2_4
cmpq $16, %r13
jbe poly1305_init_ext_sse2_5
leaq 60(%r8), %rdi
jmp poly1305_init_ext_sse2_6
poly1305_init_ext_sse2_4:
cmpq $96, %r13
jb poly1305_init_ext_sse2_5
leaq 80(%r8), %rdi
/* square the value held in (r9, rsi, rcx) = 44/44/42-bit limbs; wrapped terms use 20 * top limb */
poly1305_init_ext_sse2_6:
imulq $20, %rcx, %r10
movq $0, -48(%rsp)
movq $0, -32(%rsp)
leaq (%rsi,%rsi), %r14
leaq (%r9,%r9), %r11
movq %r10, %rax
mulq %r14
movq %rax, %r14
movq %r9, %rax
movq %rdx, %r15
mulq %r9
addq %rax, %r14
movq %r14, %rax
adcq %rdx, %r15
leaq (%rcx,%rcx), %rdx
andq %rbp, %rax
movq %rax, -16(%rsp)
movq %r11, %rax
movq %rdx, -24(%rsp)
mulq %rsi
movq %rax, %r11
movq %r10, %rax
movq %rdx, %r12
mulq %rcx
movq -16(%rsp), %rcx
addq %rax, %r11
movq %r14, %rax
adcq %rdx, %r12
shrdq $44, %r15, %rax
movq %rax, -56(%rsp)
movq -24(%rsp), %rax
addq -56(%rsp), %r11
adcq -48(%rsp), %r12
mulq %r9
movq %r11, %r14
andq %rbp, %r14
movq %rax, %r9
movq %rsi, %rax
movq %rdx, %r10
mulq %rsi
addq %rax, %r9
movq %r11, %rax
adcq %rdx, %r10
shrdq $44, %r12, %rax
movq %rax, -40(%rsp)
/* 4398046511103 = 2^42 - 1: mask of the top limb; its overflow is folded back into the low limb times 5 */
movabsq $4398046511103, %rax
addq -40(%rsp), %r9
adcq -32(%rsp), %r10
andq %r9, %rax
incq %rbx
shrdq $42, %r10, %r9
leaq (%r9,%r9,4), %r9
addq %r9, %rcx
movq %rcx, %r9
shrq $44, %rcx
addq %r14, %rcx
andq %rbp, %r9
movq %rcx, %rsi
shrq $44, %rcx
movq %r9, %rdx
addq %rax, %rcx
/* the result stays in (r9, rsi, rcx) for the next pass and is stored as five 26-bit limbs at (%rdi) */
movl %r9d, %eax
andq %rbp, %rsi
andl $67108863, %eax
shrq $26, %rdx
movl %eax, (%rdi)
movl %esi, %eax
sall $18, %eax
orl %edx, %eax
movq %rsi, %rdx
andl $67108863, %eax
shrq $34, %rdx
movl %eax, 4(%rdi)
movq %rsi, %rax
shrq $8, %rax
andl $67108863, %eax
movl %eax, 8(%rdi)
movl %ecx, %eax
sall $10, %eax
orl %edx, %eax
andl $67108863, %eax
movl %eax, 12(%rdi)
movq %rcx, %rax
shrq $16, %rax
cmpq $2, %rbx
movl %eax, 16(%rdi)
jne poly1305_init_ext_sse2_7
/* done: clear the flags qword and restore the callee-saved registers */
poly1305_init_ext_sse2_5:
movq $0, 120(%r8)
popq %rbx
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
ret
FN_END poly1305_init_ext_sse2


/*
 * poly1305_blocks_sse2(state = %rdi, in = %rsi, bytes = %rdx)
 *
 * Bulk routine of the SSE2 backend. Two 16-byte blocks are processed side by
 * side, one per 64-bit lane of each XMM register, with every value held as
 * five 26-bit limbs.
 *
 * State fields used here:
 *   state+0..39    accumulator h (two lanes of five 26-bit limbs while
 *                  streaming; three 44/44/42-bit limbs after the final fold)
 *   state+40..59   limbs written first by init_ext (r)
 *   state+60..79   limbs written by init_ext's first squaring (r^2)
 *   state+80..99   limbs written by init_ext's second squaring (r^4)
 *   state+120      flags qword
 *
 * Flag bits tested:
 *   1        h already holds data; when clear, the first 32 input bytes are
 *            loaded as h directly and the bit is set
 *   4        the 2^128 marker bit is applied to the low lane only
 *   8        the 2^128 marker bit is applied to neither lane
 *   16, 32   select the multipliers of the tail multiply (see below)
 *
 * When in is NULL no input is read; instead the two lanes are summed, fully
 * reduced modulo 2^130 - 5 and stored as three limbs at state+0, +8, +16.
 *
 * The frame is realigned, and slots below %rsp (red zone) are used as well;
 * the function makes no calls.
 */
GLOBAL_HIDDEN_FN poly1305_blocks_sse2
poly1305_blocks_sse2_local:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
andq $-64, %rsp
subq $328, %rsp
/* xmm1 = {1 << 24, 1 << 24}: 2^128 marker for the top limb of each lane; xmm0 = {2^26 - 1, 2^26 - 1}: limb mask */
movq $(1 << 24), %rax
movd %rax, %xmm1
movq $((1 << 26) - 1), %rax
movd %rax, %xmm0
pshufd $68, %xmm1, %xmm1
pshufd $68, %xmm0, %xmm0
movq 120(%rdi), %rax
movaps %xmm1, 312(%rsp)
/* flag 4: keep the marker in the low lane only */
testb $4, %al
je poly1305_blocks_sse2_11
movaps 312(%rsp), %xmm1
psrldq $8, %xmm1
movaps %xmm1, 312(%rsp)
poly1305_blocks_sse2_11:
/* flag 8: no marker in either lane */
testb $8, %al
je poly1305_blocks_sse2_12
xorps %xmm1, %xmm1
movaps %xmm1, 312(%rsp)
poly1305_blocks_sse2_12:
/* flag 1 clear: split the first 32 input bytes into limbs, use them as h, set flag 1 and consume 32 bytes */
testb $1, %al
jne poly1305_blocks_sse2_13
movq 16(%rsi), %xmm1
movaps %xmm0, %xmm3
movaps %xmm0, %xmm9
movq (%rsi), %xmm15
orq $1, %rax
subq $32, %rdx
movq 8(%rsi), %xmm12
punpcklqdq %xmm1, %xmm15
movq 24(%rsi), %xmm1
movaps %xmm15, %xmm8
pand %xmm15, %xmm3
psrlq $52, %xmm15
addq $32, %rsi
punpcklqdq %xmm1, %xmm12
movaps %xmm12, %xmm1
psrlq $26, %xmm8
psllq $12, %xmm1
pand %xmm0, %xmm8
movq %rax, 120(%rdi)
por %xmm1, %xmm15
psrlq $40, %xmm12
pand %xmm15, %xmm9
por 312(%rsp), %xmm12
psrlq $26, %xmm15
pand %xmm0, %xmm15
jmp poly1305_blocks_sse2_14
/* flag 1 set: reload h from the state and spread the packed limbs over the two 64-bit lanes */
poly1305_blocks_sse2_13:
movdqu (%rdi), %xmm8
movdqu 16(%rdi), %xmm15
movdqu 32(%rdi), %xmm12
pshufd $80, %xmm8, %xmm3
pshufd $250, %xmm8, %xmm8
pshufd $80, %xmm15, %xmm9
pshufd $250, %xmm15, %xmm15
pshufd $80, %xmm12, %xmm12
/*
 * Select the multipliers for the tail multiply (h limbs are in
 * xmm3, xmm8, xmm9, xmm15, xmm12 from here on):
 *   flag 16        low lane uses state+60 (r^2), high lane uses state+40 (r)
 *   flag 32 only   low lane uses state+40 (r), high lane uses the constant 1
 *   neither        both lanes use state+60 (r^2)
 */
poly1305_blocks_sse2_14:
movq 120(%rdi), %rax
testb $48, %al
je poly1305_blocks_sse2_15
testb $16, %al
movd 56(%rdi), %xmm2
leaq 40(%rdi), %rax
je poly1305_blocks_sse2_16
movdqu 60(%rdi), %xmm1
movdqu (%rax), %xmm4
movd %xmm2, %eax
movd 76(%rdi), %xmm2
movaps %xmm1, %xmm7
movd %eax, %xmm5
punpckldq %xmm4, %xmm7
punpckhdq %xmm4, %xmm1
punpcklqdq %xmm5, %xmm2
jmp poly1305_blocks_sse2_17
poly1305_blocks_sse2_16:
movdqu (%rax), %xmm1
movl $1, %r8d
movd %r8d, %xmm4
movaps %xmm1, %xmm7
punpckldq %xmm4, %xmm7
punpckhdq %xmm4, %xmm1
poly1305_blocks_sse2_17:
pshufd $80, %xmm7, %xmm11
pshufd $80, %xmm1, %xmm4
pshufd $250, %xmm7, %xmm7
movaps %xmm11, 168(%rsp)
pshufd $250, %xmm1, %xmm1
jmp poly1305_blocks_sse2_18
poly1305_blocks_sse2_15:
movdqu 60(%rdi), %xmm1
movd 76(%rdi), %xmm2
pshufd $0, %xmm2, %xmm2
pshufd $0, %xmm1, %xmm11
pshufd $85, %xmm1, %xmm7
pshufd $170, %xmm1, %xmm4
movaps %xmm11, 168(%rsp)
pshufd $255, %xmm1, %xmm1
/* spill the selected multiplier limbs and their 5x multiples (xmm6 = {5, 5}) to the stack */
poly1305_blocks_sse2_18:
movaps %xmm1, %xmm14
movaps %xmm7, %xmm5
movaps %xmm4, %xmm13
movaps %xmm1, 264(%rsp)
movaps %xmm2, %xmm1
cmpq $63, %rdx
movq $(5), %r8
movd %r8, %xmm6
pshufd $68, %xmm6, %xmm6
pmuludq %xmm6, %xmm5
movaps %xmm4, 296(%rsp)
pmuludq %xmm6, %xmm13
movaps %xmm2, 152(%rsp)
pmuludq %xmm6, %xmm14
pmuludq %xmm6, %xmm1
movaps %xmm5, 88(%rsp)
movaps %xmm13, 72(%rsp)
movaps %xmm14, 56(%rsp)
movaps %xmm1, 40(%rsp)
/* fewer than 64 bytes left (flags from the cmpq $63 above): skip the main loop */
jbe poly1305_blocks_sse2_19
/* main loop setup: broadcast the limbs stored at state+80 and their 5x multiples; rcx = bytes left, rax = input cursor */
movdqu 80(%rdi), %xmm1
movd 96(%rdi), %xmm2
movq %rdx, %rcx
pshufd $0, %xmm2, %xmm2
movaps %xmm2, 24(%rsp)
pmuludq %xmm6, %xmm2
pshufd $85, %xmm1, %xmm4
movaps %xmm4, 280(%rsp)
pmuludq %xmm6, %xmm4
pshufd $255, %xmm1, %xmm13
pshufd $170, %xmm1, %xmm5
movaps 72(%rsp), %xmm14
movaps %xmm5, 216(%rsp)
pmuludq %xmm6, %xmm5
movq %rsi, %rax
movaps %xmm4, -24(%rsp)
movaps %xmm13, %xmm4
pshufd $0, %xmm1, %xmm1
pmuludq %xmm6, %xmm4
movaps %xmm14, -8(%rsp)
movaps %xmm5, 8(%rsp)
movaps 168(%rsp), %xmm5
movaps %xmm1, 248(%rsp)
movaps 56(%rsp), %xmm1
movaps %xmm4, 120(%rsp)
movaps 40(%rsp), %xmm4
movaps %xmm13, 136(%rsp)
movaps %xmm2, 200(%rsp)
movaps %xmm1, 104(%rsp)
movaps %xmm4, 184(%rsp)
movaps %xmm5, 232(%rsp)
jmp poly1305_blocks_sse2_20
.p2align 6
/* main loop: 64 input bytes per iteration, ending with a carry pass that brings the limbs back to about 26 bits */
poly1305_blocks_sse2_20:
movaps -24(%rsp), %xmm5
movaps %xmm8, %xmm13
subq $64, %rcx
movaps 8(%rsp), %xmm4
movaps 120(%rsp), %xmm10
pmuludq %xmm12, %xmm5
pmuludq %xmm15, %xmm4
movaps 8(%rsp), %xmm2
pmuludq %xmm9, %xmm10
movaps 120(%rsp), %xmm11
movaps 200(%rsp), %xmm14
pmuludq %xmm12, %xmm2
paddq %xmm4, %xmm5
pmuludq %xmm15, %xmm11
movaps 120(%rsp), %xmm1
paddq %xmm10, %xmm5
pmuludq %xmm8, %xmm14
movaps 200(%rsp), %xmm10
movaps 200(%rsp), %xmm4
pmuludq %xmm12, %xmm1
movaps 248(%rsp), %xmm8
pmuludq %xmm15, %xmm10
paddq %xmm11, %xmm2
pmuludq %xmm12, %xmm4
paddq %xmm14, %xmm5
movaps 200(%rsp), %xmm11
movaps 248(%rsp), %xmm14
pmuludq %xmm15, %xmm8
pmuludq 248(%rsp), %xmm12
pmuludq %xmm9, %xmm11
paddq %xmm10, %xmm1
movaps 248(%rsp), %xmm10
pmuludq 280(%rsp), %xmm15
pmuludq %xmm3, %xmm14
paddq %xmm15, %xmm12
paddq %xmm8, %xmm4
pmuludq %xmm13, %xmm10
movq 24(%rax), %xmm15
movaps 248(%rsp), %xmm8
paddq %xmm11, %xmm2
movaps %xmm3, %xmm11
movaps 280(%rsp), %xmm3
paddq %xmm14, %xmm5
pmuludq %xmm9, %xmm8
paddq %xmm10, %xmm2
movq 16(%rax), %xmm14
movaps 280(%rsp), %xmm10
pmuludq %xmm9, %xmm3
pmuludq 216(%rsp), %xmm9
paddq %xmm9, %xmm12
paddq %xmm8, %xmm1
movq (%rax), %xmm8
pmuludq %xmm11, %xmm10
paddq %xmm3, %xmm4
movaps 216(%rsp), %xmm3
punpcklqdq %xmm14, %xmm8
movaps 280(%rsp), %xmm14
pmuludq %xmm13, %xmm3
paddq %xmm10, %xmm2
movq 8(%rax), %xmm10
pmuludq %xmm13, %xmm14
pmuludq 136(%rsp), %xmm13
paddq %xmm13, %xmm12
punpcklqdq %xmm15, %xmm10
movaps %xmm10, %xmm9
movaps 216(%rsp), %xmm15
paddq %xmm3, %xmm4
psllq $12, %xmm9
movaps %xmm0, %xmm3
paddq %xmm14, %xmm1
pmuludq %xmm11, %xmm15
pand %xmm8, %xmm3
movaps 136(%rsp), %xmm14
movaps %xmm3, -40(%rsp)
movaps %xmm8, %xmm3
movdqu 48(%rax), %xmm13
psrlq $52, %xmm8
pmuludq %xmm11, %xmm14
paddq %xmm15, %xmm1
por %xmm9, %xmm8
pmuludq 24(%rsp), %xmm11
paddq %xmm11, %xmm12
movdqu 32(%rax), %xmm11
movaps %xmm10, %xmm9
psrlq $40, %xmm10
pand %xmm0, %xmm8
movaps %xmm11, %xmm15
paddq %xmm14, %xmm4
xorps %xmm14, %xmm14
punpckldq %xmm13, %xmm15
psrlq $14, %xmm9
addq $64, %rax
pand %xmm0, %xmm9
psrlq $26, %xmm3
/* loop condition is evaluated here; the SSE instructions that follow leave the flags untouched until the ja */
cmpq $63, %rcx
por 312(%rsp), %xmm10
movaps %xmm13, -72(%rsp)
movaps %xmm15, %xmm13
punpckldq %xmm14, %xmm13
punpckhdq -72(%rsp), %xmm11
movaps %xmm13, -56(%rsp)
movaps %xmm11, %xmm13
punpckhdq %xmm14, %xmm11
pand %xmm0, %xmm3
psllq $18, %xmm11
punpckhdq %xmm14, %xmm15
punpckldq %xmm14, %xmm13
paddq %xmm11, %xmm4
movaps -8(%rsp), %xmm11
psllq $6, %xmm15
psllq $12, %xmm13
movaps 88(%rsp), %xmm14
paddq %xmm15, %xmm2
pmuludq %xmm10, %xmm11
paddq %xmm13, %xmm1
movaps -8(%rsp), %xmm13
pmuludq %xmm10, %xmm14
paddq -56(%rsp), %xmm5
paddq 312(%rsp), %xmm12
pmuludq %xmm9, %xmm13
movaps 104(%rsp), %xmm15
paddq %xmm11, %xmm2
movaps 184(%rsp), %xmm11
paddq %xmm14, %xmm5
movaps 104(%rsp), %xmm14
pmuludq %xmm9, %xmm15
pmuludq %xmm10, %xmm11
paddq %xmm13, %xmm5
movaps 104(%rsp), %xmm13
pmuludq %xmm10, %xmm14
pmuludq 232(%rsp), %xmm10
paddq %xmm10, %xmm12
pmuludq %xmm8, %xmm13
paddq %xmm15, %xmm2
movaps %xmm8, %xmm10
paddq %xmm11, %xmm4
pmuludq %xmm7, %xmm10
movaps 232(%rsp), %xmm11
movaps 184(%rsp), %xmm15
paddq %xmm14, %xmm1
pmuludq %xmm9, %xmm11
paddq %xmm13, %xmm5
movaps 184(%rsp), %xmm13
movaps 184(%rsp), %xmm14
pmuludq %xmm3, %xmm15
pmuludq %xmm9, %xmm13
paddq %xmm11, %xmm4
pmuludq %xmm8, %xmm14
movaps 232(%rsp), %xmm11
paddq %xmm10, %xmm4
paddq %xmm15, %xmm5
pmuludq %xmm7, %xmm9
pmuludq %xmm8, %xmm11
paddq %xmm13, %xmm1
movaps 232(%rsp), %xmm13
movaps 296(%rsp), %xmm10
paddq %xmm14, %xmm2
pmuludq 296(%rsp), %xmm8
movaps -40(%rsp), %xmm14
pmuludq %xmm3, %xmm13
paddq %xmm9, %xmm12
paddq %xmm11, %xmm1
movaps %xmm3, %xmm11
paddq %xmm8, %xmm12
movaps 232(%rsp), %xmm15
pmuludq %xmm7, %xmm11
pmuludq %xmm3, %xmm10
paddq %xmm13, %xmm2
movaps %xmm14, %xmm13
movaps 296(%rsp), %xmm9
pmuludq %xmm14, %xmm15
pmuludq 264(%rsp), %xmm3
paddq %xmm11, %xmm1
pmuludq %xmm7, %xmm13
paddq %xmm3, %xmm12
movaps 264(%rsp), %xmm11
paddq %xmm10, %xmm4
pmuludq %xmm14, %xmm9
paddq %xmm15, %xmm5
pmuludq %xmm14, %xmm11
movaps %xmm5, %xmm8
paddq %xmm13, %xmm2
psrlq $26, %xmm8
paddq %xmm9, %xmm1
pand %xmm0, %xmm5
pmuludq 152(%rsp), %xmm14
paddq %xmm14, %xmm12
paddq %xmm8, %xmm2
paddq %xmm11, %xmm4
movaps %xmm2, %xmm9
movaps %xmm2, %xmm8
movaps %xmm4, %xmm3
psrlq $26, %xmm9
pand %xmm0, %xmm4
psrlq $26, %xmm3
paddq %xmm9, %xmm1
pand %xmm0, %xmm8
paddq %xmm3, %xmm12
movaps %xmm1, %xmm10
movaps %xmm1, %xmm9
movaps %xmm12, %xmm3
psrlq $26, %xmm10
pand %xmm0, %xmm12
psrlq $26, %xmm3
paddq %xmm10, %xmm4
pand %xmm0, %xmm9
pmuludq %xmm6, %xmm3
movaps %xmm4, %xmm1
movaps %xmm4, %xmm15
psrlq $26, %xmm1
pand %xmm0, %xmm15
paddq %xmm1, %xmm12
paddq %xmm3, %xmm5
movaps %xmm5, %xmm2
movaps %xmm5, %xmm3
psrlq $26, %xmm2
pand %xmm0, %xmm3
paddq %xmm2, %xmm8
ja poly1305_blocks_sse2_20
/* advance in past the bytes consumed by the loop and keep bytes mod 64 */
leaq -64(%rdx), %rax
andl $63, %edx
andq $-64, %rax
leaq 64(%rsi,%rax), %rsi
/* tail: with at least 32 bytes left, multiply h by the multipliers selected above */
poly1305_blocks_sse2_19:
cmpq $31, %rdx
jbe poly1305_blocks_sse2_21
movaps 56(%rsp), %xmm11
movaps %xmm15, %xmm1
movaps %xmm15, %xmm14
movaps 72(%rsp), %xmm5
movaps %xmm12, %xmm4
movaps %xmm15, %xmm10
movaps 88(%rsp), %xmm2
pmuludq %xmm11, %xmm14
movaps %xmm8, %xmm15
pmuludq %xmm5, %xmm1
movaps 40(%rsp), %xmm13
/* ZF set here is consumed by the je further down; only SSE instructions sit in between */
testq %rsi, %rsi
pmuludq %xmm12, %xmm2
pmuludq %xmm12, %xmm5
pmuludq %xmm11, %xmm4
paddq %xmm1, %xmm2
pmuludq %xmm9, %xmm11
movaps %xmm12, %xmm1
paddq %xmm14, %xmm5
pmuludq %xmm13, %xmm15
movaps %xmm9, %xmm14
pmuludq %xmm13, %xmm14
pmuludq %xmm13, %xmm1
paddq %xmm11, %xmm2
movaps 168(%rsp), %xmm11
pmuludq %xmm10, %xmm13
paddq %xmm15, %xmm2
movaps %xmm9, %xmm15
paddq %xmm14, %xmm5
pmuludq %xmm11, %xmm12
movaps %xmm3, %xmm14
pmuludq %xmm11, %xmm14
movaps %xmm13, 248(%rsp)
movaps %xmm10, %xmm13
pmuludq %xmm7, %xmm15
paddq 248(%rsp), %xmm4
pmuludq %xmm11, %xmm13
pmuludq %xmm7, %xmm10
paddq %xmm14, %xmm2
movaps %xmm13, 280(%rsp)
movaps %xmm8, %xmm13
pmuludq %xmm11, %xmm13
paddq %xmm10, %xmm12
movaps 296(%rsp), %xmm10
paddq 280(%rsp), %xmm1
pmuludq %xmm9, %xmm11
pmuludq 296(%rsp), %xmm9
pmuludq %xmm3, %xmm10
paddq %xmm9, %xmm12
paddq %xmm13, %xmm5
movaps %xmm3, %xmm13
paddq %xmm15, %xmm1
pmuludq %xmm7, %xmm13
paddq %xmm11, %xmm4
movaps 296(%rsp), %xmm11
pmuludq %xmm8, %xmm7
pmuludq %xmm8, %xmm11
pmuludq 264(%rsp), %xmm8
paddq %xmm8, %xmm12
paddq %xmm13, %xmm5
paddq %xmm7, %xmm4
movaps 264(%rsp), %xmm7
paddq %xmm11, %xmm1
paddq %xmm10, %xmm4
pmuludq %xmm3, %xmm7
pmuludq 152(%rsp), %xmm3
paddq %xmm3, %xmm12
paddq %xmm7, %xmm1
/* in == NULL: nothing to add; otherwise add the next 32 input bytes (plus the marker) to the products */
je poly1305_blocks_sse2_22
movdqu (%rsi), %xmm7
xorps %xmm3, %xmm3
paddq 312(%rsp), %xmm12
movdqu 16(%rsi), %xmm8
movaps %xmm7, %xmm9
punpckldq %xmm8, %xmm9
punpckhdq %xmm8, %xmm7
movaps %xmm9, %xmm10
movaps %xmm7, %xmm8
punpckldq %xmm3, %xmm10
punpckhdq %xmm3, %xmm9
punpckhdq %xmm3, %xmm7
punpckldq %xmm3, %xmm8
movaps %xmm8, %xmm3
psllq $6, %xmm9
paddq %xmm10, %xmm2
psllq $12, %xmm3
paddq %xmm9, %xmm5
psllq $18, %xmm7
paddq %xmm3, %xmm4
paddq %xmm7, %xmm1
/* carry pass: bring every limb back to about 26 bits; the carry out of the top limb wraps to the lowest limb times 5 */
poly1305_blocks_sse2_22:
movaps %xmm2, %xmm8
movaps %xmm1, %xmm3
movaps %xmm1, %xmm15
psrlq $26, %xmm8
pand %xmm0, %xmm2
pand %xmm0, %xmm15
psrlq $26, %xmm3
paddq %xmm5, %xmm8
paddq %xmm12, %xmm3
movaps %xmm8, %xmm9
pand %xmm0, %xmm8
movaps %xmm3, %xmm1
psrlq $26, %xmm9
movaps %xmm3, %xmm12
psrlq $26, %xmm1
paddq %xmm4, %xmm9
pand %xmm0, %xmm12
pmuludq %xmm1, %xmm6
movaps %xmm9, %xmm3
pand %xmm0, %xmm9
psrlq $26, %xmm3
paddq %xmm3, %xmm15
paddq %xmm6, %xmm2
movaps %xmm15, %xmm3
pand %xmm0, %xmm15
movaps %xmm2, %xmm1
psrlq $26, %xmm3
psrlq $26, %xmm1
paddq %xmm3, %xmm12
movaps %xmm0, %xmm3
paddq %xmm1, %xmm8
pand %xmm2, %xmm3
/* in != NULL: pack the low dwords of both lanes and store h back to state+0..39 */
poly1305_blocks_sse2_21:
testq %rsi, %rsi
je poly1305_blocks_sse2_23
pshufd $8, %xmm3, %xmm3
pshufd $8, %xmm8, %xmm8
pshufd $8, %xmm9, %xmm9
pshufd $8, %xmm15, %xmm15
pshufd $8, %xmm12, %xmm12
punpcklqdq %xmm8, %xmm3
punpcklqdq %xmm15, %xmm9
movdqu %xmm3, (%rdi)
movdqu %xmm9, 16(%rdi)
movq %xmm12, 32(%rdi)
jmp poly1305_blocks_sse2_10
/*
 * in == NULL: add the high lane to the low lane limb by limb, repack the five
 * 26-bit limbs into 44/44/42-bit limbs, reduce modulo 2^130 - 5 (including the
 * masked choice between h and h + 5 - 2^130) and store the three limbs at
 * state+0, +8 and +16.
 */
poly1305_blocks_sse2_23:
movaps %xmm3, %xmm0
movaps %xmm8, %xmm4
movaps %xmm9, %xmm2
psrldq $8, %xmm0
movaps %xmm15, %xmm10
paddq %xmm0, %xmm3
psrldq $8, %xmm4
movaps %xmm12, %xmm0
movd %xmm3, %edx
paddq %xmm4, %xmm8
psrldq $8, %xmm2
movl %edx, %ecx
movd %xmm8, %eax
paddq %xmm2, %xmm9
shrl $26, %ecx
psrldq $8, %xmm10
andl $67108863, %edx
addl %ecx, %eax
movd %xmm9, %ecx
paddq %xmm10, %xmm15
movl %eax, %r9d
shrl $26, %eax
psrldq $8, %xmm0
addl %ecx, %eax
movd %xmm15, %ecx
paddq %xmm0, %xmm12
movl %eax, %esi
andl $67108863, %r9d
movd %xmm12, %r10d
shrl $26, %esi
andl $67108863, %eax
addl %ecx, %esi
salq $8, %rax
movl %r9d, %ecx
shrl $18, %r9d
movl %esi, %r8d
shrl $26, %esi
andl $67108863, %r8d
addl %r10d, %esi
orq %r9, %rax
salq $16, %rsi
movq %r8, %r9
shrl $10, %r8d
salq $26, %rcx
orq %r8, %rsi
salq $34, %r9
orq %rdx, %rcx
movq %rsi, %r11
shrq $42, %rsi
/* rdx = 2^44 - 1 and r8 = 2^42 - 1: limb masks of the 44/44/42-bit layout */
movabsq $17592186044415, %rdx
orq %r9, %rax
movabsq $4398046511103, %r8
andq %rdx, %rcx
andq %rdx, %rax
andq %r8, %r11
leaq (%rsi,%rsi,4), %rsi
addq %rsi, %rcx
movq %rcx, %r10
shrq $44, %rcx
addq %rcx, %rax
andq %rdx, %r10
movq %rax, %r9
shrq $44, %rax
addq %r11, %rax
andq %rdx, %r9
/* r11 starts as -2^42 so that it ends up holding the top limb of h + 5 - 2^130 */
movabsq $-4398046511104, %r11
movq %rax, %rcx
andq %r8, %rcx
shrq $42, %rax
leaq (%rax,%rax,4), %rsi
addq %rcx, %r11
addq %r10, %rsi
movq %rsi, %r8
shrq $44, %rsi
andq %rdx, %r8
addq %r9, %rsi
leaq 5(%r8), %r9
movq %r9, %rbx
andq %rdx, %r9
shrq $44, %rbx
addq %rsi, %rbx
movq %rbx, %rax
andq %rbx, %rdx
shrq $44, %rax
addq %rax, %r11
/* rax = all-ones when the subtraction did not go negative, else 0; r10 is its complement */
movq %r11, %rax
shrq $63, %rax
decq %rax
movq %rax, %r10
andq %rax, %r9
andq %rax, %rdx
notq %r10
andq %r11, %rax
andq %r10, %r8
andq %r10, %rsi
andq %r10, %rcx
orq %r9, %r8
orq %rdx, %rsi
orq %rax, %rcx
movq %r8, (%rdi)
movq %rsi, 8(%rdi)
movq %rcx, 16(%rdi)
/* epilogue: restore %rbx (saved just below the frame pointer) and drop the realigned frame */
poly1305_blocks_sse2_10:
movq -8(%rbp), %rbx
leave
ret
FN_END poly1305_blocks_sse2

/*
 * poly1305_finish_ext_sse2(state = %rdi, in = %rsi, remaining = %rdx, mac = %rcx)
 *
 * 1. If remaining != 0, the bytes are copied into a zeroed 32-byte stack
 *    block, a 1 byte is appended unless exactly 16 bytes remain, flag 8
 *    (remaining < 16) or flag 4 (remaining >= 16) is set, and the block is
 *    absorbed through poly1305_blocks_sse2.
 * 2. If flag 1 is set (something was absorbed), flag 32 (remaining in 1..16)
 *    or flag 16 (otherwise) is set and poly1305_blocks_sse2 is called with
 *    in = NULL, which folds and reduces the accumulator.
 * 3. The three accumulator limbs at state+0/8/16 are recombined into 128
 *    bits, the pad at state+104/112 is added, 128 bytes of state are wiped and
 *    the 16-byte result is stored at mac.
 *
 * The copy in step 1 handles at most 31 bytes (bits 16, 8, 4, 2, 1 of
 * remaining).
 */
GLOBAL_HIDDEN_FN poly1305_finish_ext_sse2
poly1305_finish_ext_sse2_local:
/* r12 = mac, rbp = remaining, rbx = state */
pushq %r12
movq %rcx, %r12
pushq %rbp
movq %rdx, %rbp
pushq %rbx
movq %rdi, %rbx
subq $32, %rsp
testq %rdx, %rdx
je poly1305_finish_ext_sse2_27
/* zero the 32-byte scratch block at (%rsp): 8 dwords via rep stosl */
xorl %eax, %eax
movq %rsp, %rdi
movl $8, %ecx
rep stosl
/* rsi = in - scratch, so (%rax,%rsi) is the input byte that belongs at scratch position rax */
subq %rsp, %rsi
testb $16, %dl
movq %rsp, %rax
je poly1305_finish_ext_sse2_28
movdqu (%rsp,%rsi), %xmm0
addq $16, %rax
movaps %xmm0, (%rsp)
poly1305_finish_ext_sse2_28:
testb $8, %bpl
je poly1305_finish_ext_sse2_29
movq (%rax,%rsi), %rdx
movq %rdx, (%rax)
addq $8, %rax
poly1305_finish_ext_sse2_29:
testb $4, %bpl
je poly1305_finish_ext_sse2_30
movl (%rax,%rsi), %edx
movl %edx, (%rax)
addq $4, %rax
poly1305_finish_ext_sse2_30:
testb $2, %bpl
je poly1305_finish_ext_sse2_31
movw (%rax,%rsi), %dx
movw %dx, (%rax)
addq $2, %rax
poly1305_finish_ext_sse2_31:
testb $1, %bpl
je poly1305_finish_ext_sse2_32
movb (%rax,%rsi), %dl
movb %dl, (%rax)
/* append the terminating 1 byte, except when exactly 16 bytes remain */
poly1305_finish_ext_sse2_32:
cmpq $16, %rbp
je poly1305_finish_ext_sse2_33
movb $1, (%rsp,%rbp)
/* sbb yields -1 when remaining < 16, so rax becomes 8 in that case and 4 otherwise; OR it into the flags */
poly1305_finish_ext_sse2_33:
cmpq $16, %rbp
movl $32, %edx
movq %rsp, %rsi
sbbq %rax, %rax
movq %rbx, %rdi
andl $4, %eax
addq $4, %rax
orq %rax, 120(%rbx)
call poly1305_blocks_sse2_local
/* flush: only needed when flag 1 says the accumulator holds data */
poly1305_finish_ext_sse2_27:
movq 120(%rbx), %rax
testb $1, %al
je poly1305_finish_ext_sse2_35
/* remaining - 1 <= 15 (unsigned) means remaining is 1..16: flag 32; remaining 0 or above 16: flag 16 */
decq %rbp
cmpq $15, %rbp
jbe poly1305_finish_ext_sse2_36
orq $16, %rax
jmp poly1305_finish_ext_sse2_40
poly1305_finish_ext_sse2_36:
orq $32, %rax
poly1305_finish_ext_sse2_40:
movq %rax, 120(%rbx)
movl $32, %edx
/* in = NULL selects the fold-and-reduce path of poly1305_blocks_sse2 */
xorl %esi, %esi
movq %rbx, %rdi
call poly1305_blocks_sse2_local
/* rebuild the low 128 bits from the limbs at state+0/8/16 (44-bit spacing) and add the pad with carry */
poly1305_finish_ext_sse2_35:
movq 8(%rbx), %rax
movq 112(%rbx), %rsi
movq %rax, %rdx
movq %rax, %rcx
movq 16(%rbx), %rax
shrq $20, %rcx
salq $44, %rdx
orq (%rbx), %rdx
salq $24, %rax
orq %rcx, %rax
movq 104(%rbx), %rcx
addq %rcx, %rdx
adcq %rsi, %rax
/* wipe state bytes 0..127 before returning */
xorps %xmm0, %xmm0
movdqu %xmm0, (%rbx)
movdqu %xmm0, 16(%rbx)
movdqu %xmm0, 32(%rbx)
movdqu %xmm0, 48(%rbx)
movdqu %xmm0, 64(%rbx)
movdqu %xmm0, 80(%rbx)
movdqu %xmm0, 96(%rbx)
movdqu %xmm0, 112(%rbx)
movq %rdx, (%r12)
movq %rax, 8(%r12)
addq $32, %rsp
popq %rbx
popq %rbp
popq %r12
ret
FN_END poly1305_finish_ext_sse2

/*
 * poly1305_auth_sse2(mac = %rdi, in = %rsi, inlen = %rdx, key = %rcx)
 *
 * One-shot MAC. Messages shorter than 128 bytes are handed to
 * poly1305_auth_x86_local with the arguments untouched. Longer messages use a
 * 128-byte, 64-byte-aligned state on the stack: init_ext(state, key, inlen),
 * blocks for the largest multiple of 32 bytes, then finish_ext for the rest.
 *
 * NOTE(review): poly1305_auth_x86_local is not defined in the part of this
 * file shown here - presumably it comes from one of the included files;
 * confirm that it is available wherever this file is assembled.
 */
GLOBAL_HIDDEN_FN poly1305_auth_sse2
cmpq $128, %rdx
jb poly1305_auth_x86_local
pushq %rbp
movq %rsp, %rbp
pushq %r14
pushq %r13
/* r13 = mac, r12 = in, rbx = inlen; rsi = key for the init call */
movq %rdi, %r13
pushq %r12
movq %rsi, %r12
movq %rcx, %rsi
pushq %rbx
movq %rdx, %rbx
/* carve out the aligned 128-byte state; rdx still holds inlen and serves as bytes_hint */
andq $-64, %rsp
movq %rbx, %r14
addq $-128, %rsp
movq %rsp, %rdi
call poly1305_init_ext_sse2_local
/* r14 = inlen rounded down to a multiple of 32 */
andq $-32, %r14
je poly1305_auth_sse2_42
movq %r12, %rsi
movq %r14, %rdx
movq %rsp, %rdi
call poly1305_blocks_sse2_local
addq %r14, %r12
subq %r14, %rbx
/* finish with the remaining (inlen mod 32) bytes and write the tag to mac */
poly1305_auth_sse2_42:
movq %r13, %rcx
movq %rbx, %rdx
movq %r12, %rsi
movq %rsp, %rdi
call poly1305_finish_ext_sse2_local
/* undo the alignment: the four saved registers sit directly below %rbp */
leaq -32(%rbp), %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %rbp
ret
FN_END poly1305_auth_sse2






Loading…
Cancel
Save