@@ -4,7 +4,7 @@ INCLUDE(AsmOp.cmake) | |||
TARGET_ARCHITECTURE(ARCH) | |||
SET(CHACHASRC chacha20/chacha.c chacha20/ref.c) | |||
SET(POLYSRC poly1305/poly1305-donna.c) | |||
SET(POLYSRC poly1305/poly1305.c) | |||
# For now we support only x86_64 architecture with optimizations | |||
# ARCH is expanded inside quotes: an empty value or a ;-separated list would
# otherwise expand to zero or several arguments and break the IF() condition.
IF("${ARCH}" STREQUAL "x86_64") | |||
@@ -12,35 +12,44 @@ IF(${ARCH} STREQUAL "x86_64") | |||
ASM_OP(HAVE_AVX2 "vpaddq %ymm0, %ymm0, %ymm0" "avx2") | |||
ASM_OP(HAVE_AVX "vpaddq %xmm0, %xmm0, %xmm0" "avx") | |||
ASM_OP(HAVE_SSE2 "pmuludq %xmm0, %xmm0" "sse2") | |||
ASM_OP(HAVE_SLASHMACRO " | |||
.macro TEST1 op | |||
\\op %eax, %eax | |||
.endm | |||
TEST1 xorl | |||
" "slash macro convention") | |||
ASM_OP(HAVE_DOLLARMACRO " | |||
.macro TEST1 op | |||
$0 %eax, %eax | |||
.endm | |||
TEST1 xorl | |||
" "dollar macro convention") | |||
CONFIGURE_FILE(platform_config.h.in platform_config.h) | |||
INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}") | |||
SET(CURVESRC curve25519/curve25519-donna-c64.c) | |||
SET(POLYSRC ${POLYSRC} poly1305/ref-64.c) | |||
ELSEIF("${ARCH}" STREQUAL "i386") | |||
SET(POLYSRC ${POLYSRC} poly1305/ref-32.c) | |||
SET(CURVESRC curve25519/curve25519-donna.c) | |||
ELSE() | |||
SET(CURVESRC curve25519/ref.c) | |||
SET(POLYSRC ${POLYSRC} poly1305/ref-32.c) | |||
ENDIF() | |||
# Add the hand-written assembly sources for each instruction set whose
# HAVE_* variable is true (AVX2, then AVX, then SSE2).
IF(HAVE_AVX2) | |||
LIST(APPEND CHACHASRC chacha20/avx2.S) | |||
LIST(APPEND POLYSRC poly1305/avx2.S) | |||
ENDIF() | |||
IF(HAVE_AVX) | |||
LIST(APPEND CHACHASRC chacha20/avx.S) | |||
LIST(APPEND POLYSRC poly1305/avx.S) | |||
ENDIF() | |||
IF(HAVE_SSE2) | |||
LIST(APPEND CHACHASRC chacha20/sse2.S) | |||
LIST(APPEND POLYSRC poly1305/sse2.S) | |||
ENDIF() | |||
SET(LIBCRYPTOBOXSRC cryptobox.c) |
@@ -1,4 +1,5 @@ | |||
#include "macro.S" | |||
#include "constants.S" | |||
SECTION_TEXT | |||
GLOBAL_HIDDEN_FN chacha_blocks_avx |
@@ -1,4 +1,5 @@ | |||
#include "macro.S" | |||
#include "constants.S" | |||
SECTION_TEXT | |||
GLOBAL_HIDDEN_FN chacha_blocks_avx2 |
@@ -1,4 +1,5 @@ | |||
/* Copyright (c) 2015, Vsevolod Stakhov | |||
* Copyright (c) 2015, Andrew Moon | |||
* All rights reserved. | |||
* | |||
* Redistribution and use in source and binary forms, with or without |
@@ -0,0 +1,6 @@ | |||
/*
 * Read-only constant table for the ChaCha assembly, aligned to 16 bytes.
 * Layout: 16 bytes of cipher constant, followed by two 16-byte pshufb
 * shuffle masks (byte offsets 16 and 32 from chacha_constants).
 */
SECTION_RODATA | |||
.p2align 4,,15 | |||
chacha_constants: | |||
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 /* "expand 32-byte k" */ | |||
.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ | |||
.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ | |||
@@ -176,10 +176,3 @@ | |||
#endif | |||
#endif | |||
.endm | |||
/*
 * Read-only constant table for the ChaCha assembly, aligned to 16 bytes.
 * Layout: 16 bytes of cipher constant, followed by two 16-byte pshufb
 * shuffle masks (byte offsets 16 and 32 from chacha_constants).
 */
SECTION_RODATA | |||
.p2align 4,,15 | |||
chacha_constants: | |||
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 /* "expand 32-byte k" */ | |||
.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ | |||
.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ | |||
@@ -1,5 +1,5 @@ | |||
#include "macro.S" | |||
#include "constants.S" | |||
SECTION_TEXT | |||
GLOBAL_HIDDEN_FN chacha_blocks_sse2 |
@@ -24,7 +24,7 @@ | |||
#include "cryptobox.h" | |||
#include "platform_config.h" | |||
#include "chacha20/chacha.h" | |||
#include "poly1305/poly1305-donna.h" | |||
#include "poly1305/poly1305.h" | |||
#include "curve25519/curve25519.h" | |||
#include "ottery.h" | |||
@@ -105,6 +105,7 @@ rspamd_cryptobox_init (void) | |||
} | |||
chacha_load (); | |||
poly1305_load (); | |||
} | |||
void | |||
@@ -139,7 +140,7 @@ void rspamd_cryptobox_encrypt_nm_inplace (guchar *data, gsize len, | |||
const rspamd_nonce_t nonce, | |||
const rspamd_nm_t nm, rspamd_sig_t sig) | |||
{ | |||
poly1305_context mac_ctx; | |||
poly1305_state mac_ctx; | |||
guchar subkey[CHACHA_BLOCKBYTES]; | |||
chacha_state s; | |||
gsize r; | |||
@@ -151,7 +152,7 @@ void rspamd_cryptobox_encrypt_nm_inplace (guchar *data, gsize len, | |||
r = chacha_update (&s, data, data, len); | |||
chacha_final (&s, data + r); | |||
poly1305_init (&mac_ctx, subkey); | |||
poly1305_init (&mac_ctx, (const poly1305_key *)subkey); | |||
poly1305_update (&mac_ctx, data, len); | |||
poly1305_finish (&mac_ctx, sig); | |||
@@ -163,7 +164,7 @@ gboolean | |||
rspamd_cryptobox_decrypt_nm_inplace (guchar *data, gsize len, | |||
const rspamd_nonce_t nonce, const rspamd_nm_t nm, const rspamd_sig_t sig) | |||
{ | |||
poly1305_context mac_ctx; | |||
poly1305_state mac_ctx; | |||
guchar subkey[CHACHA_BLOCKBYTES]; | |||
rspamd_sig_t mac; | |||
chacha_state s; | |||
@@ -175,7 +176,7 @@ rspamd_cryptobox_decrypt_nm_inplace (guchar *data, gsize len, | |||
memset (subkey, 0, sizeof (subkey)); | |||
chacha_update (&s, subkey, subkey, sizeof (subkey)); | |||
poly1305_init (&mac_ctx, subkey); | |||
poly1305_init (&mac_ctx, (const poly1305_key *)subkey); | |||
poly1305_update (&mac_ctx, data, len); | |||
poly1305_finish (&mac_ctx, mac); | |||
@@ -1,112 +0,0 @@ | |||
"A state-of-the-art message-authentication code" | |||
# ABOUT | |||
See: [http://cr.yp.to/mac.html](http://cr.yp.to/mac.html) and [http://cr.yp.to/mac/poly1305-20050329.pdf](http://cr.yp.to/mac/poly1305-20050329.pdf) | |||
These are quite portable implementations of increasing efficiency depending on the size of the multiplier available. | |||
Optimized implementations have been moved to [poly1305-opt](https://github.com/floodyberry/poly1305-opt) | |||
# BUILDING | |||
## Default | |||
If compiled with no options, `poly1305-donna.c` will select between the 32 bit and 64 bit implementations based | |||
on what it can tell the compiler supports | |||
gcc poly1305-donna.c -O3 -o poly1305.o | |||
## Selecting a specific version | |||
gcc poly1305-donna.c -O3 -o poly1305.o -DPOLY1305_XXBITS | |||
Where `-DPOLY1305_XXBITS` is one of | |||
* `-DPOLY1305_8BITS`, 8->16 bit multiplies, 32 bit additions | |||
* `-DPOLY1305_16BITS`, 16->32 bit multiplies, 32 bit additions | |||
* `-DPOLY1305_32BITS`, 32->64 bit multiplies, 64 bit additions | |||
* `-DPOLY1305_64BITS`, 64->128 bit multiplies, 128 bit additions | |||
8 bit and 16 bit versions were written to keep the code size small, 32 bit and 64 bit versions are mildly optimized due | |||
to needing fewer multiplications. All 4 can be made faster at the expense of increased code size and complexity, which | |||
is not the intention of this project. | |||
# USAGE | |||
See: [http://nacl.cace-project.eu/onetimeauth.html](http://nacl.cace-project.eu/onetimeauth.html), specifically (slightly plagiarized): | |||
The poly1305_auth function, viewed as a function of the message for a uniform random key, is | |||
designed to meet the standard notion of unforgeability after a single message. After the sender | |||
authenticates one message, an attacker cannot find authenticators for any other messages. | |||
The sender **MUST NOT** use poly1305_auth to authenticate more than one message under the same key. | |||
Authenticators for two messages under the same key should be expected to reveal enough information | |||
to allow forgeries of authenticators on other messages. | |||
## Functions | |||
`poly1305_context` is declared in [poly1305-donna.h](poly1305-donna.h) and is an opaque structure large enough to support | |||
every underlying platform specific implementation. It should be size_t aligned, which should be handled already | |||
with the size_t member `aligner`. | |||
`void poly1305_init(poly1305_context *ctx, const unsigned char key[32]);` | |||
where | |||
`key` is the 32 byte key that is **only used for this message and is discarded immediately after** | |||
`void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes);` | |||
where `m` is a pointer to the message fragment to be processed, and | |||
`bytes` is the length of the message fragment | |||
`void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]);` | |||
where `mac` is the buffer which receives the 16 byte authenticator. After calling finish, the underlying | |||
implementation will zero out `ctx`. | |||
`void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]);` | |||
where `mac` is the buffer which receives the 16 byte authenticator, | |||
`m` is a pointer to the message to be processed, | |||
`bytes` is the number of bytes in the message, and | |||
`key` is the 32 byte key that is **only used for this message and is discarded immediately after**. | |||
`int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);` | |||
where `mac1` is compared to `mac2` in constant time and returns `1` if they are equal and `0` if they are not | |||
`int poly1305_power_on_self_test(void);` | |||
tests the underlying implementation to verify it is working correctly. It returns `1` if all tests pass, and `0` if | |||
any tests fail. | |||
## Example | |||
### Simple | |||
#include "poly1305-donna.h" | |||
unsigned char key[32] = {...}, mac[16]; | |||
unsigned char msg[] = {...}; | |||
poly1305_auth(mac, msg, msglen, key); | |||
### Full | |||
[example-poly1305.c](example-poly1305.c) is a simple example of how to verify the underlying implementation is producing | |||
the correct results, compute an authenticator, and test it against an expected value. | |||
# LICENSE | |||
[MIT](http://www.opensource.org/licenses/mit-license.php) or PUBLIC DOMAIN | |||
# NAMESAKE | |||
I borrowed the idea for these from Adam Langley's [curve25519-donna](http://github.com/agl/curve25519-donna), hence | |||
the name. |
@@ -0,0 +1,875 @@ | |||
#include "../chacha20/macro.S" | |||
#include "constants.S" | |||
SECTION_TEXT | |||
/*
 * poly1305_block_size_avx: takes no inputs and returns the constant 32 in
 * %eax (presumably the number of message bytes the AVX code consumes per
 * call unit; confirm against the C dispatcher).
 */
GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx,0,0 | |||
movl $32, %eax | |||
ret | |||
FN_END poly1305_block_size_avx | |||
/*
 * poly1305_init_ext_avx: initialise a Poly1305 state from a 32-byte key.
 *
 * Inputs (as used below):
 *   %rdi  state pointer (at least 128 bytes are written: offsets 0..127)
 *   %rsi  key pointer (32 bytes are read)
 *   %rdx  message length hint; 0 is treated as "unlimited" (-1)
 *
 * State layout written here:
 *     0..47   zeroed accumulator area
 *    40..59   r as five 26-bit limbs (after clamping)
 *    60..79   r^2 as five 26-bit limbs (only if hint > 16)
 *    80..99   r^4 as five 26-bit limbs (only if hint > 95)
 *   104..119  key bytes 16..31, copied verbatim
 *   120..127  flags word, cleared to 0
 *
 * Scratch values are kept at negative offsets from %rsp; no stack frame is
 * allocated beyond the six callee-saved register pushes.
 */
GLOBAL_HIDDEN_FN_EXT poly1305_init_ext_avx,4,1 | |||
poly1305_init_ext_avx_local: | |||
pushq %r15 | |||
pushq %r14 | |||
pushq %r13 | |||
pushq %r12 | |||
pushq %rbp | |||
pushq %rbx | |||
movq %rdi, %rbp | |||
/* hint = (rdx != 0) ? rdx : -1, saved below the stack pointer */
testq %rdx, %rdx | |||
movq $-1, %rax | |||
cmovne %rdx, %rax | |||
movq %rax, -16(%rsp) | |||
/* clear the first 48 bytes of the state */
vpxor %xmm0, %xmm0, %xmm0 | |||
vmovdqu %xmm0, (%rdi) | |||
vmovdqu %xmm0, 16(%rdi) | |||
vmovdqu %xmm0, 32(%rdi) | |||
/* load key bytes 0..15 and split into clamped limbs r10 (44 bits), r9 (44 bits), r8 (42 bits) */
movq (%rsi), %r9 | |||
movq 8(%rsi), %r8 | |||
movabsq $17575274610687, %r10 | |||
andq %r9, %r10 | |||
shrq $44, %r9 | |||
movq %r8, %rax | |||
salq $20, %rax | |||
orq %rax, %r9 | |||
movabsq $17592181915647, %rax | |||
andq %rax, %r9 | |||
shrq $24, %r8 | |||
movabsq $68719475727, %rax | |||
andq %rax, %r8 | |||
/* re-pack r as five 26-bit limbs (mask 67108863 = 2^26-1) at state+40 */
leaq 40(%rdi), %r15 | |||
movl %r10d, %eax | |||
andl $67108863, %eax | |||
movl %eax, 40(%rdi) | |||
movl %r9d, %edx | |||
sall $18, %edx | |||
movq %r10, %rax | |||
shrq $26, %rax | |||
orl %edx, %eax | |||
andl $67108863, %eax | |||
movl %eax, 44(%rdi) | |||
movq %r9, %rax | |||
shrq $8, %rax | |||
andl $67108863, %eax | |||
movl %eax, 48(%rdi) | |||
movq %r9, %rax | |||
shrq $34, %rax | |||
movl %r8d, %edx | |||
sall $10, %edx | |||
orl %edx, %eax | |||
andl $67108863, %eax | |||
movl %eax, 52(%rdi) | |||
movq %r8, %rax | |||
shrq $16, %rax | |||
movl %eax, 56(%rdi) | |||
/* copy key bytes 16..31 to state+104 */
movq 16(%rsi), %rax | |||
movq %rax, 104(%rdi) | |||
movq 24(%rsi), %rax | |||
movq %rax, 112(%rdi) | |||
/* rbx = iteration counter (0, 1): each pass squares the current (r10,r9,r8) */
movl $0, %ebx | |||
.L7: | |||
testq %rbx, %rbx | |||
jne .L4 | |||
/* pass 0: destination state+60; skip all powers if hint <= 16 */
leaq 60(%rbp), %r15 | |||
cmpq $16, -16(%rsp) | |||
ja .L6 | |||
jmp .L5 | |||
.L4: | |||
cmpq $1, %rbx | |||
jne .L6 | |||
/* pass 1: destination state+80; skip if hint <= 95 */
leaq 80(%rbp), %r15 | |||
cmpq $95, -16(%rsp) | |||
jbe .L5 | |||
.L6: | |||
/* square the 44/44/42-bit value; rsi = 20*r8 folds the high limb back in */
leaq (%r8,%r8,4), %rsi | |||
salq $2, %rsi | |||
leaq (%r9,%r9), %rdi | |||
movq %rdi, %rax | |||
mulq %rsi | |||
movq %rax, %r13 | |||
movq %rdx, %r14 | |||
movq %r10, %rax | |||
mulq %r10 | |||
addq %r13, %rax | |||
adcq %r14, %rdx | |||
movabsq $17592186044415, %rcx | |||
movq %rax, -72(%rsp) | |||
movq %rdx, -64(%rsp) | |||
andq -72(%rsp), %rcx | |||
leaq (%r10,%r10), %r11 | |||
movq %r11, %rax | |||
mulq %r9 | |||
movq %rax, %r11 | |||
movq %rdx, %r12 | |||
movq %rsi, %rax | |||
mulq %r8 | |||
movq %rax, %r13 | |||
movq %rdx, %r14 | |||
addq %r11, %r13 | |||
adcq %r12, %r14 | |||
movq -72(%rsp), %rax | |||
movq -64(%rsp), %rdx | |||
shrdq $44, %rdx, %rax | |||
movq %rax, -56(%rsp) | |||
movq $0, -48(%rsp) | |||
addq -56(%rsp), %r13 | |||
adcq -48(%rsp), %r14 | |||
movabsq $17592186044415, %rsi | |||
andq %r13, %rsi | |||
leaq (%r8,%r8), %rdi | |||
movq %rdi, %rax | |||
mulq %r10 | |||
movq %rax, %r11 | |||
movq %rdx, %r12 | |||
movq %r9, %rax | |||
mulq %r9 | |||
addq %r11, %rax | |||
adcq %r12, %rdx | |||
shrdq $44, %r14, %r13 | |||
movq %r13, -40(%rsp) | |||
movq $0, -32(%rsp) | |||
addq -40(%rsp), %rax | |||
adcq -32(%rsp), %rdx | |||
movabsq $4398046511103, %rdi | |||
andq %rax, %rdi | |||
/* carry out of the top limb is multiplied by 5 and added to the low limb */
shrdq $42, %rdx, %rax | |||
leaq (%rax,%rax,4), %r8 | |||
addq %rcx, %r8 | |||
movabsq $17592186044415, %r10 | |||
andq %r8, %r10 | |||
shrq $44, %r8 | |||
addq %rsi, %r8 | |||
movabsq $17592186044415, %r9 | |||
andq %r8, %r9 | |||
shrq $44, %r8 | |||
addq %rdi, %r8 | |||
/* store the squared value as five 26-bit limbs at (%r15) */
movl %r10d, %eax | |||
andl $67108863, %eax | |||
movl %eax, (%r15) | |||
movl %r9d, %edx | |||
sall $18, %edx | |||
movq %r10, %rax | |||
shrq $26, %rax | |||
orl %edx, %eax | |||
andl $67108863, %eax | |||
movl %eax, 4(%r15) | |||
movq %r9, %rax | |||
shrq $8, %rax | |||
andl $67108863, %eax | |||
movl %eax, 8(%r15) | |||
movl %r8d, %edx | |||
sall $10, %edx | |||
movq %r9, %rax | |||
shrq $34, %rax | |||
orl %edx, %eax | |||
andl $67108863, %eax | |||
movl %eax, 12(%r15) | |||
movq %r8, %rax | |||
shrq $16, %rax | |||
movl %eax, 16(%r15) | |||
addq $1, %rbx | |||
cmpq $2, %rbx | |||
jne .L7 | |||
.L5: | |||
/* clear the flags word and restore callee-saved registers */
movq $0, 120(%rbp) | |||
popq %rbx | |||
popq %rbp | |||
popq %r12 | |||
popq %r13 | |||
popq %r14 | |||
popq %r15 | |||
ret | |||
FN_END poly1305_init_ext_avx | |||
/*
 * poly1305_blocks_avx: absorb message data into the accumulator, two
 * 16-byte blocks at a time in the two 64-bit lanes of each xmm register.
 *
 * Inputs (as used below):
 *   %rdi  state pointer (layout as written by poly1305_init_ext_avx)
 *   %rsi  message pointer; a NULL pointer selects the finalisation path
 *   %rdx  number of bytes to process
 *
 * Flag bits read from / written to the word at state+120:
 *    1   accumulator already holds data (set here after the first 32 bytes)
 *    4   per-block high bit (1 << 24) is applied to the low lane only
 *    8   per-block high bit is suppressed entirely
 *   16   final multiply uses the values at state+60 and state+40
 *   32   final multiply uses the value at state+40 and the constant 1
 *
 * Stack frame: 64-byte aligned, 200 bytes; 152(%rsp) holds the high-bit
 * vector, 184(%rsp) the constant 5, 168(%rsp) a spill of %xmm1, and the
 * slots in between hold multiplier limbs and their 5x multiples.
 */
GLOBAL_HIDDEN_FN poly1305_blocks_avx | |||
poly1305_blocks_avx_local: | |||
pushq %rbp | |||
movq %rsp, %rbp | |||
pushq %rbx | |||
andq $-64, %rsp | |||
subq $200, %rsp | |||
/* broadcast constants: xmm1 = 1<<24, xmm0 = 2^26-1 limb mask, xmm2 = 5 */
movl $(1 << 24), %eax | |||
movl $((1 << 26) - 1), %r8d | |||
movl $(5), %r9d | |||
vmovd %eax, %xmm1 | |||
vmovd %r8d, %xmm0 | |||
vmovd %r9d, %xmm2 | |||
vpshufd $68, %xmm1, %xmm1 | |||
vpshufd $68, %xmm0, %xmm0 | |||
vpshufd $68, %xmm2, %xmm2 | |||
vmovdqa %xmm1, 152(%rsp) | |||
vmovdqa %xmm2, 184(%rsp) | |||
movq 120(%rdi), %rax | |||
/* flag 4: keep the high bit in the low lane only */
testb $4, %al | |||
je .L12 | |||
vpsrldq $8, %xmm1, %xmm1 | |||
vmovdqa %xmm1, 152(%rsp) | |||
.L12: | |||
/* flag 8: no high bit at all */
testb $8, %al | |||
je .L13 | |||
vpxor %xmm1, %xmm1, %xmm1 | |||
vmovdqa %xmm1, 152(%rsp) | |||
.L13: | |||
testb $1, %al | |||
jne .L14 | |||
/* first call: the accumulator is the first 32 message bytes split into 26-bit limbs */
vmovq (%rsi), %xmm1 | |||
vpinsrq $1, 16(%rsi), %xmm1, %xmm1 | |||
vmovq 8(%rsi), %xmm3 | |||
vpinsrq $1, 24(%rsi), %xmm3, %xmm2 | |||
vpand %xmm0, %xmm1, %xmm7 | |||
vpsrlq $26, %xmm1, %xmm12 | |||
vpand %xmm0, %xmm12, %xmm12 | |||
vpsllq $12, %xmm2, %xmm3 | |||
vpsrlq $52, %xmm1, %xmm1 | |||
vpor %xmm3, %xmm1, %xmm6 | |||
vpand %xmm0, %xmm6, %xmm3 | |||
vpsrlq $26, %xmm6, %xmm6 | |||
vpand %xmm0, %xmm6, %xmm6 | |||
vpsrlq $40, %xmm2, %xmm2 | |||
vpor 152(%rsp), %xmm2, %xmm2 | |||
addq $32, %rsi | |||
subq $32, %rdx | |||
orq $1, %rax | |||
movq %rax, 120(%rdi) | |||
jmp .L15 | |||
.L14: | |||
/* later calls: reload the packed accumulator from state+0..39 and widen it to one limb per 64-bit lane */
vmovdqu (%rdi), %xmm12 | |||
vmovdqu 16(%rdi), %xmm6 | |||
vmovdqu 32(%rdi), %xmm2 | |||
vpshufd $80, %xmm12, %xmm7 | |||
vpshufd $250, %xmm12, %xmm12 | |||
vpshufd $80, %xmm6, %xmm3 | |||
vpshufd $250, %xmm6, %xmm6 | |||
vpshufd $80, %xmm2, %xmm2 | |||
.L15: | |||
/* choose the multiplier for the trailing 32-byte step (flags 16 / 32, else broadcast of state+60) */
movq 120(%rdi), %rax | |||
testb $48, %al | |||
je .L16 | |||
testb $16, %al | |||
je .L17 | |||
vmovdqu 40(%rdi), %xmm1 | |||
vmovd 56(%rdi), %xmm4 | |||
vmovdqu 60(%rdi), %xmm5 | |||
vpunpckldq %xmm1, %xmm5, %xmm11 | |||
vpunpckhdq %xmm1, %xmm5, %xmm5 | |||
vmovd 76(%rdi), %xmm1 | |||
vpunpcklqdq %xmm4, %xmm1, %xmm4 | |||
jmp .L18 | |||
.L17: | |||
movl $(1), %r8d | |||
vmovdqu 40(%rdi), %xmm5 | |||
vmovd 56(%rdi), %xmm4 | |||
vmovd %r8d, %xmm1 | |||
vpunpckldq %xmm1, %xmm5, %xmm11 | |||
vpunpckhdq %xmm1, %xmm5, %xmm5 | |||
.L18: | |||
vpshufd $80, %xmm11, %xmm1 | |||
vpshufd $250, %xmm11, %xmm11 | |||
vpshufd $80, %xmm5, %xmm10 | |||
vpshufd $250, %xmm5, %xmm5 | |||
jmp .L19 | |||
.L16: | |||
vmovdqu 60(%rdi), %xmm5 | |||
vpshufd $0, %xmm5, %xmm1 | |||
vpshufd $85, %xmm5, %xmm11 | |||
vpshufd $170, %xmm5, %xmm10 | |||
vpshufd $255, %xmm5, %xmm5 | |||
vmovd 76(%rdi), %xmm4 | |||
vpshufd $0, %xmm4, %xmm4 | |||
.L19: | |||
/* spill multiplier limbs 1..4 and their products with 5 (limb 0 stays in xmm1) */
vmovdqa %xmm11, 136(%rsp) | |||
vpmuludq 184(%rsp), %xmm11, %xmm13 | |||
vmovdqa %xmm13, 120(%rsp) | |||
vmovdqa %xmm10, 104(%rsp) | |||
vpmuludq 184(%rsp), %xmm10, %xmm13 | |||
vmovdqa %xmm13, 88(%rsp) | |||
vmovdqa %xmm5, 72(%rsp) | |||
vpmuludq 184(%rsp), %xmm5, %xmm5 | |||
vmovdqa %xmm5, 56(%rsp) | |||
vmovdqa %xmm4, 40(%rsp) | |||
vpmuludq 184(%rsp), %xmm4, %xmm4 | |||
vmovdqa %xmm4, 24(%rsp) | |||
/* fewer than 64 bytes: skip the main loop */
cmpq $63, %rdx | |||
jbe .L20 | |||
/* load the value at state+80 (five limbs), broadcast each limb, and spill it with its 5x multiples */
vmovdqu 80(%rdi), %xmm4 | |||
vpshufd $0, %xmm4, %xmm5 | |||
vmovdqa %xmm5, 8(%rsp) | |||
vpshufd $85, %xmm4, %xmm5 | |||
vmovdqa %xmm5, -8(%rsp) | |||
vpshufd $170, %xmm4, %xmm13 | |||
vmovdqa %xmm13, -24(%rsp) | |||
vpshufd $255, %xmm4, %xmm4 | |||
vmovdqa %xmm4, %xmm10 | |||
vmovdqa %xmm4, -40(%rsp) | |||
vmovd 96(%rdi), %xmm4 | |||
vpshufd $0, %xmm4, %xmm4 | |||
vmovdqa %xmm4, %xmm8 | |||
vmovdqa %xmm4, -56(%rsp) | |||
vpmuludq 184(%rsp), %xmm5, %xmm4 | |||
vmovdqa %xmm4, -72(%rsp) | |||
vpmuludq 184(%rsp), %xmm13, %xmm4 | |||
vmovdqa %xmm4, -88(%rsp) | |||
vpmuludq 184(%rsp), %xmm10, %xmm4 | |||
vmovdqa %xmm4, -104(%rsp) | |||
vpmuludq 184(%rsp), %xmm8, %xmm4 | |||
vmovdqa %xmm4, -120(%rsp) | |||
leaq 32(%rsi), %rax | |||
movq %rdx, %rcx | |||
vmovdqa %xmm1, 168(%rsp) | |||
jmp .L22 | |||
.p2align 6 | |||
nop | |||
nop | |||
nop | |||
nop | |||
.L22: | |||
/*
 * Main loop: one iteration consumes 64 message bytes (%rax points 32 bytes
 * past the first of them, %rcx counts remaining bytes). The accumulator is
 * multiplied by the state+80 value, the two message halves are split into
 * limbs, one half is multiplied by the .L19 multiplier, and carries are
 * propagated at the end.
 */
vpmuludq -72(%rsp), %xmm2, %xmm13 | |||
vmovdqa -88(%rsp), %xmm5 | |||
vpmuludq %xmm5, %xmm6, %xmm4 | |||
vpmuludq %xmm5, %xmm2, %xmm11 | |||
vmovdqa -104(%rsp), %xmm9 | |||
vpmuludq %xmm9, %xmm6, %xmm5 | |||
vpmuludq %xmm9, %xmm2, %xmm10 | |||
vpaddq %xmm4, %xmm13, %xmm13 | |||
vpmuludq %xmm9, %xmm3, %xmm4 | |||
vmovdqa -120(%rsp), %xmm8 | |||
vpmuludq %xmm8, %xmm2, %xmm9 | |||
vpaddq %xmm5, %xmm11, %xmm11 | |||
vmovdqa %xmm8, %xmm5 | |||
vpmuludq %xmm8, %xmm12, %xmm8 | |||
vpmuludq %xmm5, %xmm3, %xmm14 | |||
vpaddq %xmm4, %xmm13, %xmm13 | |||
vpmuludq %xmm5, %xmm6, %xmm4 | |||
vmovdqa 8(%rsp), %xmm15 | |||
vpmuludq %xmm15, %xmm6, %xmm5 | |||
vpaddq %xmm8, %xmm13, %xmm13 | |||
vpmuludq %xmm15, %xmm2, %xmm8 | |||
vpaddq %xmm14, %xmm11, %xmm11 | |||
vpmuludq %xmm15, %xmm7, %xmm14 | |||
vpaddq %xmm4, %xmm10, %xmm10 | |||
vpmuludq %xmm15, %xmm12, %xmm4 | |||
vpaddq %xmm5, %xmm9, %xmm9 | |||
vpmuludq %xmm15, %xmm3, %xmm5 | |||
vmovdqa -8(%rsp), %xmm15 | |||
vpmuludq %xmm15, %xmm3, %xmm2 | |||
vpaddq %xmm14, %xmm13, %xmm13 | |||
vpmuludq %xmm15, %xmm6, %xmm6 | |||
vpaddq %xmm4, %xmm11, %xmm11 | |||
vpmuludq %xmm15, %xmm7, %xmm4 | |||
vpaddq %xmm5, %xmm10, %xmm10 | |||
vmovq -32(%rax), %xmm5 | |||
vpinsrq $1, -16(%rax), %xmm5, %xmm5 | |||
vpmuludq %xmm15, %xmm12, %xmm14 | |||
vpaddq %xmm2, %xmm9, %xmm9 | |||
vmovdqa -24(%rsp), %xmm2 | |||
vpmuludq %xmm2, %xmm12, %xmm15 | |||
vpaddq %xmm6, %xmm8, %xmm8 | |||
vpmuludq %xmm2, %xmm3, %xmm3 | |||
vpaddq %xmm4, %xmm11, %xmm11 | |||
vmovq -24(%rax), %xmm4 | |||
vpinsrq $1, -8(%rax), %xmm4, %xmm6 | |||
vpmuludq %xmm2, %xmm7, %xmm4 | |||
vpaddq %xmm14, %xmm10, %xmm10 | |||
vmovdqa -40(%rsp), %xmm1 | |||
vpmuludq %xmm1, %xmm7, %xmm14 | |||
vpaddq %xmm15, %xmm9, %xmm9 | |||
vpand %xmm5, %xmm0, %xmm2 | |||
vpmuludq %xmm1, %xmm12, %xmm12 | |||
vpaddq %xmm3, %xmm8, %xmm8 | |||
vpsrlq $26, %xmm5, %xmm3 | |||
vpand %xmm3, %xmm0, %xmm3 | |||
vpmuludq -56(%rsp), %xmm7, %xmm7 | |||
vpaddq %xmm4, %xmm10, %xmm10 | |||
vpsllq $12, %xmm6, %xmm15 | |||
vpsrlq $52, %xmm5, %xmm4 | |||
vpor %xmm15, %xmm4, %xmm4 | |||
vpaddq %xmm14, %xmm9, %xmm9 | |||
vpsrlq $14, %xmm6, %xmm5 | |||
vpand %xmm5, %xmm0, %xmm5 | |||
vpaddq %xmm12, %xmm8, %xmm8 | |||
vpand %xmm4, %xmm0, %xmm4 | |||
vpaddq %xmm7, %xmm8, %xmm8 | |||
vpsrlq $40, %xmm6, %xmm6 | |||
vpor 152(%rsp), %xmm6, %xmm6 | |||
vmovdqu (%rax), %xmm12 | |||
vmovdqu 16(%rax), %xmm7 | |||
vpunpckldq %xmm7, %xmm12, %xmm15 | |||
vpunpckhdq %xmm7, %xmm12, %xmm7 | |||
vpxor %xmm14, %xmm14, %xmm14 | |||
vpunpckldq %xmm14, %xmm15, %xmm12 | |||
vpunpckhdq %xmm14, %xmm15, %xmm15 | |||
vpunpckldq %xmm14, %xmm7, %xmm14 | |||
vpxor %xmm1, %xmm1, %xmm1 | |||
vpunpckhdq %xmm1, %xmm7, %xmm7 | |||
vpsllq $6, %xmm15, %xmm15 | |||
vpsllq $12, %xmm14, %xmm14 | |||
vpsllq $18, %xmm7, %xmm7 | |||
vpaddq %xmm12, %xmm13, %xmm12 | |||
vpaddq %xmm15, %xmm11, %xmm15 | |||
vpaddq %xmm14, %xmm10, %xmm14 | |||
vpaddq %xmm7, %xmm9, %xmm7 | |||
vpaddq 152(%rsp), %xmm8, %xmm8 | |||
vpmuludq 120(%rsp), %xmm6, %xmm13 | |||
vmovdqa 88(%rsp), %xmm10 | |||
vpmuludq %xmm10, %xmm5, %xmm9 | |||
vpmuludq %xmm10, %xmm6, %xmm11 | |||
vmovdqa 56(%rsp), %xmm1 | |||
vpmuludq %xmm1, %xmm5, %xmm10 | |||
vpaddq %xmm13, %xmm12, %xmm12 | |||
vpmuludq %xmm1, %xmm6, %xmm13 | |||
vpaddq %xmm9, %xmm12, %xmm12 | |||
vpmuludq %xmm1, %xmm4, %xmm9 | |||
vpaddq %xmm11, %xmm15, %xmm15 | |||
vmovdqa 24(%rsp), %xmm1 | |||
vpmuludq %xmm1, %xmm6, %xmm11 | |||
vpaddq %xmm10, %xmm15, %xmm10 | |||
vpmuludq %xmm1, %xmm3, %xmm15 | |||
vpaddq %xmm13, %xmm14, %xmm14 | |||
vpmuludq %xmm1, %xmm4, %xmm13 | |||
vpaddq %xmm9, %xmm12, %xmm9 | |||
vpmuludq %xmm1, %xmm5, %xmm12 | |||
vpaddq %xmm11, %xmm7, %xmm7 | |||
vpmuludq 168(%rsp), %xmm5, %xmm11 | |||
vpaddq %xmm15, %xmm9, %xmm9 | |||
vpmuludq 168(%rsp), %xmm6, %xmm6 | |||
vpaddq %xmm13, %xmm10, %xmm10 | |||
vpmuludq 168(%rsp), %xmm2, %xmm15 | |||
vpaddq %xmm12, %xmm14, %xmm14 | |||
vpmuludq 168(%rsp), %xmm3, %xmm13 | |||
vpaddq %xmm11, %xmm7, %xmm11 | |||
vpmuludq 168(%rsp), %xmm4, %xmm12 | |||
vpaddq %xmm6, %xmm8, %xmm6 | |||
vmovdqa 136(%rsp), %xmm8 | |||
vpmuludq %xmm8, %xmm4, %xmm7 | |||
vpaddq %xmm15, %xmm9, %xmm9 | |||
vpmuludq %xmm8, %xmm5, %xmm5 | |||
vpaddq %xmm13, %xmm10, %xmm10 | |||
vpmuludq %xmm8, %xmm2, %xmm15 | |||
vpaddq %xmm12, %xmm14, %xmm14 | |||
vpmuludq %xmm8, %xmm3, %xmm8 | |||
vpaddq %xmm7, %xmm11, %xmm11 | |||
vmovdqa 104(%rsp), %xmm7 | |||
vpmuludq %xmm7, %xmm3, %xmm13 | |||
vpaddq %xmm5, %xmm6, %xmm6 | |||
vpmuludq %xmm7, %xmm4, %xmm4 | |||
vpaddq %xmm15, %xmm10, %xmm10 | |||
vpmuludq %xmm7, %xmm2, %xmm15 | |||
vpaddq %xmm8, %xmm14, %xmm14 | |||
vmovdqa 72(%rsp), %xmm5 | |||
vpmuludq %xmm5, %xmm2, %xmm7 | |||
vpaddq %xmm13, %xmm11, %xmm11 | |||
vpmuludq %xmm5, %xmm3, %xmm3 | |||
vpaddq %xmm4, %xmm6, %xmm6 | |||
vpmuludq 40(%rsp), %xmm2, %xmm2 | |||
vpaddq %xmm15, %xmm14, %xmm14 | |||
vpaddq %xmm7, %xmm11, %xmm11 | |||
vpaddq %xmm3, %xmm6, %xmm6 | |||
vpaddq %xmm2, %xmm6, %xmm2 | |||
/* carry propagation between 26-bit limbs; the top carry is multiplied by 5 and folded into limb 0 */
vpsrlq $26, %xmm9, %xmm12 | |||
vpsrlq $26, %xmm11, %xmm5 | |||
vpand %xmm0, %xmm9, %xmm9 | |||
vpand %xmm0, %xmm11, %xmm11 | |||
vpaddq %xmm12, %xmm10, %xmm10 | |||
vpaddq %xmm5, %xmm2, %xmm2 | |||
vpsrlq $26, %xmm10, %xmm3 | |||
vpsrlq $26, %xmm2, %xmm7 | |||
vpand %xmm0, %xmm10, %xmm10 | |||
vpand %xmm0, %xmm2, %xmm2 | |||
vpaddq %xmm3, %xmm14, %xmm3 | |||
vpmuludq 184(%rsp), %xmm7, %xmm7 | |||
vpaddq %xmm7, %xmm9, %xmm9 | |||
vpsrlq $26, %xmm3, %xmm6 | |||
vpsrlq $26, %xmm9, %xmm12 | |||
vpand %xmm0, %xmm3, %xmm3 | |||
vpand %xmm0, %xmm9, %xmm7 | |||
vpaddq %xmm6, %xmm11, %xmm6 | |||
vpaddq %xmm12, %xmm10, %xmm12 | |||
vpsrlq $26, %xmm6, %xmm8 | |||
vpand %xmm0, %xmm6, %xmm6 | |||
vpaddq %xmm8, %xmm2, %xmm2 | |||
subq $64, %rcx | |||
addq $64, %rax | |||
cmpq $63, %rcx | |||
ja .L22 | |||
/* restore xmm1, advance %rsi past the bytes consumed by the loop, keep the remainder (< 64) in %rdx */
vmovdqa 168(%rsp), %xmm1 | |||
leaq -64(%rdx), %rax | |||
andq $-64, %rax | |||
leaq 64(%rsi,%rax), %rsi | |||
andl $63, %edx | |||
.L20: | |||
/* trailing step: taken when at least 32 bytes remain */
cmpq $31, %rdx | |||
jbe .L23 | |||
vpmuludq 120(%rsp), %xmm2, %xmm11 | |||
vmovdqa 88(%rsp), %xmm4 | |||
vpmuludq %xmm4, %xmm6, %xmm0 | |||
vpmuludq %xmm4, %xmm2, %xmm10 | |||
vmovdqa 56(%rsp), %xmm4 | |||
vpmuludq %xmm4, %xmm6, %xmm8 | |||
vpmuludq %xmm4, %xmm2, %xmm5 | |||
vpaddq %xmm0, %xmm11, %xmm11 | |||
vpmuludq %xmm4, %xmm3, %xmm0 | |||
vmovdqa 24(%rsp), %xmm13 | |||
vpmuludq %xmm13, %xmm2, %xmm4 | |||
vpaddq %xmm8, %xmm10, %xmm10 | |||
vpmuludq %xmm13, %xmm12, %xmm8 | |||
vpmuludq %xmm13, %xmm3, %xmm9 | |||
vpaddq %xmm0, %xmm11, %xmm11 | |||
vpmuludq %xmm13, %xmm6, %xmm13 | |||
vpmuludq %xmm1, %xmm6, %xmm0 | |||
vpaddq %xmm8, %xmm11, %xmm8 | |||
vpmuludq %xmm1, %xmm2, %xmm2 | |||
vpaddq %xmm9, %xmm10, %xmm9 | |||
vpmuludq %xmm1, %xmm7, %xmm11 | |||
vpaddq %xmm13, %xmm5, %xmm5 | |||
vpmuludq %xmm1, %xmm12, %xmm10 | |||
vpaddq %xmm0, %xmm4, %xmm0 | |||
vpmuludq %xmm1, %xmm3, %xmm1 | |||
vmovdqa 136(%rsp), %xmm4 | |||
vpmuludq %xmm4, %xmm3, %xmm14 | |||
vpaddq %xmm11, %xmm8, %xmm11 | |||
vpmuludq %xmm4, %xmm6, %xmm6 | |||
vpaddq %xmm10, %xmm9, %xmm9 | |||
vpmuludq %xmm4, %xmm7, %xmm15 | |||
vpaddq %xmm1, %xmm5, %xmm5 | |||
vpmuludq %xmm4, %xmm12, %xmm1 | |||
vpaddq %xmm14, %xmm0, %xmm0 | |||
vmovdqa 104(%rsp), %xmm4 | |||
vpmuludq %xmm4, %xmm12, %xmm8 | |||
vpaddq %xmm6, %xmm2, %xmm2 | |||
vpmuludq %xmm4, %xmm3, %xmm3 | |||
vpaddq %xmm15, %xmm9, %xmm9 | |||
vpmuludq %xmm4, %xmm7, %xmm10 | |||
vpaddq %xmm1, %xmm5, %xmm1 | |||
vmovdqa 72(%rsp), %xmm4 | |||
vpmuludq %xmm4, %xmm7, %xmm15 | |||
vpaddq %xmm8, %xmm0, %xmm0 | |||
vpmuludq %xmm4, %xmm12, %xmm12 | |||
vpaddq %xmm3, %xmm2, %xmm2 | |||
vpmuludq 40(%rsp), %xmm7, %xmm7 | |||
vpaddq %xmm10, %xmm1, %xmm1 | |||
vpaddq %xmm15, %xmm0, %xmm0 | |||
vpaddq %xmm12, %xmm2, %xmm2 | |||
vpaddq %xmm7, %xmm2, %xmm2 | |||
/* xmm0 was clobbered above, so the limb mask is rebuilt in xmm15; message bytes are added only when %rsi is non-NULL */
movl $((1 << 26) - 1), %r8d | |||
testq %rsi, %rsi | |||
vmovd %r8d, %xmm15 | |||
je .L24 | |||
vmovdqu (%rsi), %xmm4 | |||
vmovdqu 16(%rsi), %xmm3 | |||
vpunpckldq %xmm3, %xmm4, %xmm5 | |||
vpunpckhdq %xmm3, %xmm4, %xmm3 | |||
vpxor %xmm4, %xmm4, %xmm4 | |||
vpunpckldq %xmm4, %xmm5, %xmm7 | |||
vpunpckhdq %xmm4, %xmm5, %xmm5 | |||
vpunpckldq %xmm4, %xmm3, %xmm6 | |||
vpunpckhdq %xmm4, %xmm3, %xmm3 | |||
vpsllq $6, %xmm5, %xmm5 | |||
vpsllq $12, %xmm6, %xmm6 | |||
vpsllq $18, %xmm3, %xmm3 | |||
vpaddq %xmm7, %xmm11, %xmm11 | |||
vpaddq %xmm5, %xmm9, %xmm9 | |||
vpaddq %xmm6, %xmm1, %xmm1 | |||
vpaddq %xmm3, %xmm0, %xmm0 | |||
vpaddq 152(%rsp), %xmm2, %xmm2 | |||
.L24: | |||
/* carry propagation, same pattern as at the end of the main loop */
vpshufd $68, %xmm15, %xmm15 | |||
vpsrlq $26, %xmm11, %xmm12 | |||
vpsrlq $26, %xmm0, %xmm3 | |||
vpand %xmm15, %xmm11, %xmm11 | |||
vpand %xmm15, %xmm0, %xmm6 | |||
vpaddq %xmm12, %xmm9, %xmm9 | |||
vpaddq %xmm3, %xmm2, %xmm2 | |||
vpsrlq $26, %xmm9, %xmm3 | |||
vpsrlq $26, %xmm2, %xmm7 | |||
vpand %xmm15, %xmm9, %xmm9 | |||
vpand %xmm15, %xmm2, %xmm2 | |||
vpaddq %xmm3, %xmm1, %xmm3 | |||
vpmuludq 184(%rsp), %xmm7, %xmm7 | |||
vpaddq %xmm7, %xmm11, %xmm7 | |||
vpsrlq $26, %xmm3, %xmm4 | |||
vpsrlq $26, %xmm7, %xmm1 | |||
vpand %xmm15, %xmm3, %xmm3 | |||
vpand %xmm15, %xmm7, %xmm7 | |||
vpaddq %xmm4, %xmm6, %xmm6 | |||
vpaddq %xmm1, %xmm9, %xmm12 | |||
vpsrlq $26, %xmm6, %xmm0 | |||
vpand %xmm15, %xmm6, %xmm6 | |||
vpaddq %xmm0, %xmm2, %xmm2 | |||
.L23: | |||
testq %rsi, %rsi | |||
je .L25 | |||
/* normal exit: pack the two-lane accumulator into 40 bytes at state+0 */
vpshufd $8, %xmm7, %xmm7 | |||
vpshufd $8, %xmm12, %xmm12 | |||
vpshufd $8, %xmm3, %xmm3 | |||
vpshufd $8, %xmm6, %xmm6 | |||
vpshufd $8, %xmm2, %xmm2 | |||
vpunpcklqdq %xmm12, %xmm7, %xmm7 | |||
vpunpcklqdq %xmm6, %xmm3, %xmm3 | |||
vmovdqu %xmm7, (%rdi) | |||
vmovdqu %xmm3, 16(%rdi) | |||
vmovq %xmm2, 32(%rdi) | |||
jmp .L11 | |||
.L25: | |||
/*
 * Finalisation (%rsi == NULL): add the high lane of every limb to the low
 * lane, carry in scalar registers, repack as 44/44/42-bit limbs, then
 * select between h and h + 5 - 2^130 with a mask and store the three limbs
 * at state+0, +8 and +16.
 */
vpsrldq $8, %xmm7, %xmm0 | |||
vpaddq %xmm0, %xmm7, %xmm7 | |||
vpsrldq $8, %xmm12, %xmm0 | |||
vpaddq %xmm0, %xmm12, %xmm12 | |||
vpsrldq $8, %xmm3, %xmm0 | |||
vpaddq %xmm0, %xmm3, %xmm3 | |||
vpsrldq $8, %xmm6, %xmm0 | |||
vpaddq %xmm0, %xmm6, %xmm6 | |||
vpsrldq $8, %xmm2, %xmm0 | |||
vpaddq %xmm0, %xmm2, %xmm2 | |||
vmovd %xmm7, %eax | |||
vmovd %xmm12, %edx | |||
movl %eax, %r9d | |||
shrl $26, %r9d | |||
addl %edx, %r9d | |||
movl %r9d, %r8d | |||
andl $67108863, %r8d | |||
vmovd %xmm3, %edx | |||
shrl $26, %r9d | |||
addl %edx, %r9d | |||
vmovd %xmm6, %edx | |||
movl %r9d, %ecx | |||
shrl $26, %ecx | |||
addl %edx, %ecx | |||
movl %ecx, %esi | |||
andl $67108863, %esi | |||
vmovd %xmm2, %r10d | |||
movl %r8d, %r11d | |||
salq $26, %r11 | |||
andl $67108863, %eax | |||
orq %rax, %r11 | |||
movabsq $17592186044415, %rax | |||
andq %rax, %r11 | |||
andl $67108863, %r9d | |||
salq $8, %r9 | |||
shrl $18, %r8d | |||
movl %r8d, %r8d | |||
orq %r8, %r9 | |||
movq %rsi, %rdx | |||
salq $34, %rdx | |||
orq %rdx, %r9 | |||
andq %rax, %r9 | |||
shrl $26, %ecx | |||
addl %r10d, %ecx | |||
salq $16, %rcx | |||
shrl $10, %esi | |||
movl %esi, %esi | |||
orq %rsi, %rcx | |||
movabsq $4398046511103, %r10 | |||
movq %rcx, %r8 | |||
andq %r10, %r8 | |||
shrq $42, %rcx | |||
leaq (%rcx,%rcx,4), %rdx | |||
addq %r11, %rdx | |||
movq %rdx, %rsi | |||
andq %rax, %rsi | |||
shrq $44, %rdx | |||
addq %r9, %rdx | |||
movq %rdx, %rcx | |||
andq %rax, %rcx | |||
shrq $44, %rdx | |||
addq %r8, %rdx | |||
andq %rdx, %r10 | |||
shrq $42, %rdx | |||
leaq (%rsi,%rdx,4), %rsi | |||
leaq (%rsi,%rdx), %r11 | |||
movq %r11, %rbx | |||
andq %rax, %rbx | |||
shrq $44, %r11 | |||
addq %rcx, %r11 | |||
leaq 5(%rbx), %r9 | |||
movq %r9, %r8 | |||
shrq $44, %r8 | |||
addq %r11, %r8 | |||
movabsq $-4398046511104, %rsi | |||
addq %r10, %rsi | |||
movq %r8, %rdx | |||
shrq $44, %rdx | |||
addq %rdx, %rsi | |||
/* rdx = all-ones if the subtraction did not borrow (sign bit clear), else 0; rcx = ~rdx */
movq %rsi, %rdx | |||
shrq $63, %rdx | |||
subq $1, %rdx | |||
movq %rdx, %rcx | |||
notq %rcx | |||
andq %rcx, %rbx | |||
andq %rcx, %r11 | |||
andq %r10, %rcx | |||
andq %rax, %r9 | |||
andq %rdx, %r9 | |||
orq %r9, %rbx | |||
movq %rbx, (%rdi) | |||
andq %r8, %rax | |||
andq %rdx, %rax | |||
orq %rax, %r11 | |||
movq %r11, 8(%rdi) | |||
andq %rsi, %rdx | |||
orq %rcx, %rdx | |||
movq %rdx, 16(%rdi) | |||
.L11: | |||
/* restore %rbx from its slot under the frame pointer and tear the frame down */
movq -8(%rbp), %rbx | |||
leave | |||
ret | |||
FN_END poly1305_blocks_avx | |||
/*
 * poly1305_finish_ext_avx: absorb the final partial input, produce the
 * 16-byte tag and wipe the state.
 *
 * Inputs (as used below):
 *   %rdi  state pointer
 *   %rsi  pointer to the remaining message bytes
 *   %rdx  number of remaining bytes (the copy below handles at most 31)
 *   %rcx  output pointer; 16 bytes are written
 *
 * Steps: pad the leftover bytes into a zeroed 32-byte stack buffer and run
 * poly1305_blocks_avx on it with flag 4 or 8 set; if any data was ever
 * absorbed (flag 1), run poly1305_blocks_avx once more with a NULL message
 * and flag 16 or 32 to finalise; then add the 16 bytes stored at state+104
 * to the accumulator, zero state bytes 0..127 and store the result.
 */
GLOBAL_HIDDEN_FN poly1305_finish_ext_avx | |||
poly1305_finish_ext_avx_local: | |||
pushq %r12 | |||
pushq %rbp | |||
pushq %rbx | |||
subq $32, %rsp | |||
movq %rdi, %rbx | |||
movq %rdx, %rbp | |||
movq %rcx, %r12 | |||
testq %rdx, %rdx | |||
je .L30 | |||
/* zero the 32-byte scratch buffer on the stack */
movq $0, (%rsp) | |||
movq $0, 8(%rsp) | |||
movq $0, 16(%rsp) | |||
movq $0, 24(%rsp) | |||
/* rax = write cursor; rsi becomes (source - destination) so (%rax,%rsi) addresses the source */
movq %rsp, %rax | |||
subq %rsp, %rsi | |||
/* copy the leftover bytes by testing length bits 16, 8, 4, 2, 1 in turn */
testb $16, %dl | |||
je .L31 | |||
vmovdqu (%rsp,%rsi), %xmm0 | |||
vmovdqa %xmm0, (%rsp) | |||
addq $16, %rax | |||
.L31: | |||
testb $8, %bpl | |||
je .L32 | |||
movq (%rax,%rsi), %rdx | |||
movq %rdx, (%rax) | |||
addq $8, %rax | |||
.L32: | |||
testb $4, %bpl | |||
je .L33 | |||
movl (%rax,%rsi), %edx | |||
movl %edx, (%rax) | |||
addq $4, %rax | |||
.L33: | |||
testb $2, %bpl | |||
je .L34 | |||
movzwl (%rax,%rsi), %edx | |||
movw %dx, (%rax) | |||
addq $2, %rax | |||
.L34: | |||
testb $1, %bpl | |||
je .L35 | |||
movzbl (%rax,%rsi), %edx | |||
movb %dl, (%rax) | |||
.L35: | |||
/* exactly 16 bytes: no terminator byte, flag 4 (see .L36) */
cmpq $16, %rbp | |||
je .L36 | |||
/* append the 0x01 terminator; flag = 8 when fewer than 16 bytes, 4 when more */
movb $1, (%rsp,%rbp) | |||
movq 120(%rbx), %rdx | |||
cmpq $16, %rbp | |||
sbbq %rax, %rax | |||
andl $4, %eax | |||
addq $4, %rax | |||
.L37: | |||
/* merge the new flag into state+120 and process the padded 32-byte buffer */
orq %rdx, %rax | |||
movq %rax, 120(%rbx) | |||
movq %rsp, %rsi | |||
movl $32, %edx | |||
movq %rbx, %rdi | |||
call poly1305_blocks_avx_local | |||
.L30: | |||
/* if nothing was ever absorbed (flag 1 clear) skip the finalisation pass */
movq 120(%rbx), %rax | |||
testb $1, %al | |||
je .L38 | |||
/* flag 32 when 1 <= leftover <= 16, flag 16 otherwise (unsigned compare of leftover-1 against 15) */
subq $1, %rbp | |||
cmpq $15, %rbp | |||
jbe .L39 | |||
orq $16, %rax | |||
movq %rax, 120(%rbx) | |||
jmp .L40 | |||
.L39: | |||
orq $32, %rax | |||
movq %rax, 120(%rbx) | |||
.L40: | |||
/* finalisation pass: NULL message pointer */
movl $32, %edx | |||
movl $0, %esi | |||
movq %rbx, %rdi | |||
call poly1305_blocks_avx_local | |||
.L38: | |||
/* join the limbs at state+0/+8/+16 (shifts 44 and 24) into a 128-bit value in rdx:rax */
movq 8(%rbx), %rax | |||
movq %rax, %rdx | |||
salq $44, %rdx | |||
orq (%rbx), %rdx | |||
shrq $20, %rax | |||
movq 16(%rbx), %rcx | |||
salq $24, %rcx | |||
orq %rcx, %rax | |||
/* add the 128-bit value stored at state+104 with carry */
movq 104(%rbx), %rcx | |||
movq 112(%rbx), %rsi | |||
addq %rcx, %rdx | |||
adcq %rsi, %rax | |||
/* wipe state bytes 0..127 before writing the tag */
vpxor %xmm0, %xmm0, %xmm0 | |||
vmovdqu %xmm0, (%rbx) | |||
vmovdqu %xmm0, 16(%rbx) | |||
vmovdqu %xmm0, 32(%rbx) | |||
vmovdqu %xmm0, 48(%rbx) | |||
vmovdqu %xmm0, 64(%rbx) | |||
vmovdqu %xmm0, 80(%rbx) | |||
vmovdqu %xmm0, 96(%rbx) | |||
vmovdqu %xmm0, 112(%rbx) | |||
movq %rdx, (%r12) | |||
movq %rax, 8(%r12) | |||
jmp .L43 | |||
.L36: | |||
movq 120(%rbx), %rdx | |||
movl $4, %eax | |||
jmp .L37 | |||
.L43: | |||
addq $32, %rsp | |||
popq %rbx | |||
popq %rbp | |||
popq %r12 | |||
ret | |||
FN_END poly1305_finish_ext_avx | |||
/*
 * poly1305_auth_avx: one-shot authenticator.
 *
 * Inputs (as used below):
 *   %rdi  output pointer for the tag (forwarded to the finish routine)
 *   %rsi  message pointer
 *   %rdx  message length in bytes
 *   %rcx  key pointer (forwarded to the init routine)
 *
 * Messages shorter than 128 bytes are tail-jumped to
 * poly1305_auth_x86_local, which is not defined in this part of the file
 * (NOTE(review): confirm where that label is provided). Longer messages use
 * a 128-byte, 64-byte-aligned state on the stack: init, process the largest
 * multiple of 32 bytes, then finish with the remainder.
 */
GLOBAL_HIDDEN_FN poly1305_auth_avx | |||
cmp $128, %rdx | |||
jb poly1305_auth_x86_local | |||
pushq %rbp | |||
movq %rsp, %rbp | |||
pushq %r14 | |||
pushq %r13 | |||
pushq %r12 | |||
pushq %rbx | |||
andq $-64, %rsp | |||
addq $-128, %rsp | |||
/* r14 = tag out, r12 = message cursor, rbx = bytes remaining */
movq %rdi, %r14 | |||
movq %rsi, %r12 | |||
movq %rdx, %rbx | |||
/* init(state = %rsp, key = %rcx, hint = %rdx which still holds the length) */
movq %rsp, %rdi | |||
movq %rcx, %rsi | |||
call poly1305_init_ext_avx_local | |||
/* r13 = length rounded down to a multiple of 32 */
movq %rbx, %r13 | |||
andq $-32, %r13 | |||
je .L46 | |||
movq %rsp, %rdi | |||
movq %r13, %rdx | |||
movq %r12, %rsi | |||
call poly1305_blocks_avx_local | |||
addq %r13, %r12 | |||
subq %r13, %rbx | |||
.L46: | |||
/* finish(state, remaining message, remaining length, tag out) */
movq %rsp, %rdi | |||
movq %r14, %rcx | |||
movq %rbx, %rdx | |||
movq %r12, %rsi | |||
call poly1305_finish_ext_avx_local | |||
/* undo the alignment: the four saved registers sit in the 32 bytes below %rbp */
leaq -32(%rbp), %rsp | |||
popq %rbx | |||
popq %r12 | |||
popq %r13 | |||
popq %r14 | |||
popq %rbp | |||
ret | |||
FN_END poly1305_auth_avx | |||
@@ -0,0 +1,21 @@ | |||
/*
 * Read-only constant table, 16-byte aligned (.p2align 4).
 * Every entry is two .long words (8 bytes); the leading comment on each line
 * is the entry's byte offset from poly1305_constants_x86.
 * NOTE(review): the names (two32, alpha64, doffset0, ...) suggest these are
 * double-precision bit patterns stored low word first -- confirm with the
 * code that loads them.
 */
SECTION_RODATA | |||
.p2align 4 | |||
poly1305_constants_x86: | |||
/* 0 */ poly1305_x86_scale: .long 0x0,0x37f40000 | |||
/* 8 */ poly1305_x86_two32: .long 0x0,0x41f00000 | |||
/* 16 */ poly1305_x86_two64: .long 0x0,0x43f00000 | |||
/* 24 */ poly1305_x86_two96: .long 0x0,0x45f00000 | |||
/* 32 */ poly1305_x86_alpha32: .long 0x0,0x45e80000 | |||
/* 40 */ poly1305_x86_alpha64: .long 0x0,0x47e80000 | |||
/* 48 */ poly1305_x86_alpha96: .long 0x0,0x49e80000 | |||
/* 56 */ poly1305_x86_alpha130: .long 0x0,0x4c080000 | |||
/* 64 */ poly1305_x86_doffset0: .long 0x0,0x43300000 | |||
/* 72 */ poly1305_x86_doffset1: .long 0x0,0x45300000 | |||
/* 80 */ poly1305_x86_doffset2: .long 0x0,0x47300000 | |||
/* 88 */ poly1305_x86_doffset3: .long 0x0,0x49300000 | |||
/* 96 */ poly1305_x86_doffset3minustwo128: .long 0x0,0x492ffffe | |||
/* 104 */ poly1305_x86_hoffset0: .long 0xfffffffb,0x43300001 | |||
/* 112 */ poly1305_x86_hoffset1: .long 0xfffffffe,0x45300001 | |||
/* 120 */ poly1305_x86_hoffset2: .long 0xfffffffe,0x47300001 | |||
/* 128 */ poly1305_x86_hoffset3: .long 0xfffffffe,0x49300003 |
@@ -1,202 +0,0 @@ | |||
/* | |||
poly1305 implementation using 16 bit * 16 bit = 32 bit multiplication and 32 bit addition | |||
*/ | |||
#if defined(_MSC_VER) | |||
#define POLY1305_NOINLINE __declspec(noinline) | |||
#elif defined(__GNUC__) | |||
#define POLY1305_NOINLINE __attribute__((noinline)) | |||
#else | |||
#define POLY1305_NOINLINE | |||
#endif | |||
#define poly1305_block_size 16 | |||
/*
 * Internal state of the 16 bit implementation. The public functions obtain
 * it by casting the caller's poly1305_context pointer.
 */
/* 17 + sizeof(size_t) + 28*sizeof(unsigned short) */ | |||
typedef struct poly1305_state_internal_t { | |||
/* bytes of a block that is not complete yet */
unsigned char buffer[poly1305_block_size]; | |||
/* number of valid bytes currently held in buffer */
size_t leftover; | |||
/* clamped key half, ten 13 bit limbs (filled by poly1305_init) */
unsigned short r[10]; | |||
/* accumulator, ten 13 bit limbs */
unsigned short h[10]; | |||
/* key bytes 16..31 as eight 16 bit words, added to h in poly1305_finish */
unsigned short pad[8]; | |||
/* set to 1 by poly1305_finish before the padded last block; disables hibit */
unsigned char final; | |||
} poly1305_state_internal_t; | |||
/* read the two bytes at p as one little-endian 16 bit unsigned integer */
static unsigned short
U8TO16(const unsigned char *p) {
	unsigned short low = (unsigned short)(p[0] & 0xff);
	unsigned short high = (unsigned short)(p[1] & 0xff);

	return (unsigned short)(low | (high << 8));
}
/* write v to p[0..1] as a little-endian 16 bit unsigned integer */
static void
U16TO8(unsigned char *p, unsigned short v) {
	unsigned int k;

	for (k = 0; k < 2; k++)
		p[k] = (unsigned char)((v >> (8 * k)) & 0xff);
}
void | |||
poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { | |||
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; | |||
unsigned short t0,t1,t2,t3,t4,t5,t6,t7; | |||
size_t i; | |||
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ | |||
t0 = U8TO16(&key[ 0]); st->r[0] = ( t0 ) & 0x1fff; | |||
t1 = U8TO16(&key[ 2]); st->r[1] = ((t0 >> 13) | (t1 << 3)) & 0x1fff; | |||
t2 = U8TO16(&key[ 4]); st->r[2] = ((t1 >> 10) | (t2 << 6)) & 0x1f03; | |||
t3 = U8TO16(&key[ 6]); st->r[3] = ((t2 >> 7) | (t3 << 9)) & 0x1fff; | |||
t4 = U8TO16(&key[ 8]); st->r[4] = ((t3 >> 4) | (t4 << 12)) & 0x00ff; | |||
st->r[5] = ((t4 >> 1) ) & 0x1ffe; | |||
t5 = U8TO16(&key[10]); st->r[6] = ((t4 >> 14) | (t5 << 2)) & 0x1fff; | |||
t6 = U8TO16(&key[12]); st->r[7] = ((t5 >> 11) | (t6 << 5)) & 0x1f81; | |||
t7 = U8TO16(&key[14]); st->r[8] = ((t6 >> 8) | (t7 << 8)) & 0x1fff; | |||
st->r[9] = ((t7 >> 5) ) & 0x007f; | |||
/* h = 0 */ | |||
for (i = 0; i < 10; i++) | |||
st->h[i] = 0; | |||
/* save pad for later */ | |||
for (i = 0; i < 8; i++) | |||
st->pad[i] = U8TO16(&key[16 + (2 * i)]); | |||
st->leftover = 0; | |||
st->final = 0; | |||
} | |||
static void | |||
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { | |||
const unsigned short hibit = (st->final) ? 0 : (1 << 11); /* 1 << 128 */ | |||
unsigned short t0,t1,t2,t3,t4,t5,t6,t7; | |||
unsigned long d[10]; | |||
unsigned long c; | |||
while (bytes >= poly1305_block_size) { | |||
size_t i, j; | |||
/* h += m[i] */ | |||
t0 = U8TO16(&m[ 0]); st->h[0] += ( t0 ) & 0x1fff; | |||
t1 = U8TO16(&m[ 2]); st->h[1] += ((t0 >> 13) | (t1 << 3)) & 0x1fff; | |||
t2 = U8TO16(&m[ 4]); st->h[2] += ((t1 >> 10) | (t2 << 6)) & 0x1fff; | |||
t3 = U8TO16(&m[ 6]); st->h[3] += ((t2 >> 7) | (t3 << 9)) & 0x1fff; | |||
t4 = U8TO16(&m[ 8]); st->h[4] += ((t3 >> 4) | (t4 << 12)) & 0x1fff; | |||
st->h[5] += ((t4 >> 1) ) & 0x1fff; | |||
t5 = U8TO16(&m[10]); st->h[6] += ((t4 >> 14) | (t5 << 2)) & 0x1fff; | |||
t6 = U8TO16(&m[12]); st->h[7] += ((t5 >> 11) | (t6 << 5)) & 0x1fff; | |||
t7 = U8TO16(&m[14]); st->h[8] += ((t6 >> 8) | (t7 << 8)) & 0x1fff; | |||
st->h[9] += ((t7 >> 5) ) | hibit; | |||
/* h *= r, (partial) h %= p */ | |||
for (i = 0, c = 0; i < 10; i++) { | |||
d[i] = c; | |||
for (j = 0; j < 10; j++) { | |||
d[i] += (unsigned long)st->h[j] * ((j <= i) ? st->r[i - j] : (5 * st->r[i + 10 - j])); | |||
/* Sum(h[i] * r[i] * 5) will overflow slightly above 6 products with an unclamped r, so carry at 5 */ | |||
if (j == 4) { | |||
c = (d[i] >> 13); | |||
d[i] &= 0x1fff; | |||
} | |||
} | |||
c += (d[i] >> 13); | |||
d[i] &= 0x1fff; | |||
} | |||
c = ((c << 2) + c); /* c *= 5 */ | |||
c += d[0]; | |||
d[0] = ((unsigned short)c & 0x1fff); | |||
c = (c >> 13); | |||
d[1] += c; | |||
for (i = 0; i < 10; i++) | |||
st->h[i] = (unsigned short)d[i]; | |||
m += poly1305_block_size; | |||
bytes -= poly1305_block_size; | |||
} | |||
} | |||
POLY1305_NOINLINE void | |||
poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { | |||
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; | |||
unsigned short c; | |||
unsigned short g[10]; | |||
unsigned short mask; | |||
unsigned long f; | |||
size_t i; | |||
/* process the remaining block */ | |||
if (st->leftover) { | |||
size_t i = st->leftover; | |||
st->buffer[i++] = 1; | |||
for (; i < poly1305_block_size; i++) | |||
st->buffer[i] = 0; | |||
st->final = 1; | |||
poly1305_blocks(st, st->buffer, poly1305_block_size); | |||
} | |||
/* fully carry h */ | |||
c = st->h[1] >> 13; | |||
st->h[1] &= 0x1fff; | |||
for (i = 2; i < 10; i++) { | |||
st->h[i] += c; | |||
c = st->h[i] >> 13; | |||
st->h[i] &= 0x1fff; | |||
} | |||
st->h[0] += (c * 5); | |||
c = st->h[0] >> 13; | |||
st->h[0] &= 0x1fff; | |||
st->h[1] += c; | |||
c = st->h[1] >> 13; | |||
st->h[1] &= 0x1fff; | |||
st->h[2] += c; | |||
/* compute h + -p */ | |||
g[0] = st->h[0] + 5; | |||
c = g[0] >> 13; | |||
g[0] &= 0x1fff; | |||
for (i = 1; i < 10; i++) { | |||
g[i] = st->h[i] + c; | |||
c = g[i] >> 13; | |||
g[i] &= 0x1fff; | |||
} | |||
g[9] -= (1 << 13); | |||
/* select h if h < p, or h + -p if h >= p */ | |||
mask = (g[9] >> ((sizeof(unsigned short) * 8) - 1)) - 1; | |||
for (i = 0; i < 10; i++) | |||
g[i] &= mask; | |||
mask = ~mask; | |||
for (i = 0; i < 10; i++) | |||
st->h[i] = (st->h[i] & mask) | g[i]; | |||
/* h = h % (2^128) */ | |||
st->h[0] = ((st->h[0] ) | (st->h[1] << 13) ) & 0xffff; | |||
st->h[1] = ((st->h[1] >> 3) | (st->h[2] << 10) ) & 0xffff; | |||
st->h[2] = ((st->h[2] >> 6) | (st->h[3] << 7) ) & 0xffff; | |||
st->h[3] = ((st->h[3] >> 9) | (st->h[4] << 4) ) & 0xffff; | |||
st->h[4] = ((st->h[4] >> 12) | (st->h[5] << 1) | (st->h[6] << 14)) & 0xffff; | |||
st->h[5] = ((st->h[6] >> 2) | (st->h[7] << 11) ) & 0xffff; | |||
st->h[6] = ((st->h[7] >> 5) | (st->h[8] << 8) ) & 0xffff; | |||
st->h[7] = ((st->h[8] >> 8) | (st->h[9] << 5) ) & 0xffff; | |||
/* mac = (h + pad) % (2^128) */ | |||
f = (unsigned long)st->h[0] + st->pad[0]; | |||
st->h[0] = (unsigned short)f; | |||
for (i = 1; i < 8; i++) { | |||
f = (unsigned long)st->h[i] + st->pad[i] + (f >> 16); | |||
st->h[i] = (unsigned short)f; | |||
} | |||
for (i = 0; i < 8; i++) | |||
U16TO8(mac + (i * 2), st->h[i]); | |||
/* zero out the state */ | |||
for (i = 0; i < 10; i++) | |||
st->h[i] = 0; | |||
for (i = 0; i < 10; i++) | |||
st->r[i] = 0; | |||
for (i = 0; i < 8; i++) | |||
st->pad[i] = 0; | |||
} |
@@ -1,219 +0,0 @@ | |||
/* | |||
poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition | |||
*/ | |||
#if defined(_MSC_VER) | |||
#define POLY1305_NOINLINE __declspec(noinline) | |||
#elif defined(__GNUC__) | |||
#define POLY1305_NOINLINE __attribute__((noinline)) | |||
#else | |||
#define POLY1305_NOINLINE | |||
#endif | |||
#define poly1305_block_size 16 | |||
/*
 * Internal state of the 32 bit implementation. The public functions obtain
 * it by casting the caller's poly1305_context pointer.
 */
/* 17 + sizeof(size_t) + 14*sizeof(unsigned long) */ | |||
typedef struct poly1305_state_internal_t { | |||
/* clamped key half, five 26 bit limbs (filled by poly1305_init) */
unsigned long r[5]; | |||
/* accumulator, five 26 bit limbs */
unsigned long h[5]; | |||
/* key bytes 16..31 as four 32 bit words, added to h in poly1305_finish */
unsigned long pad[4]; | |||
/* number of valid bytes currently held in buffer */
size_t leftover; | |||
/* bytes of a block that is not complete yet */
unsigned char buffer[poly1305_block_size]; | |||
/* set to 1 by poly1305_finish before the padded last block; disables hibit */
unsigned char final; | |||
} poly1305_state_internal_t; | |||
/* read the four bytes at p as one little-endian 32 bit unsigned integer */
static unsigned long
U8TO32(const unsigned char *p) {
	unsigned long value = 0;
	int k;

	/* start from the most significant byte and shift the rest in below it */
	for (k = 3; k >= 0; k--)
		value = (value << 8) | (unsigned long)(p[k] & 0xff);
	return value;
}
/* write the low 32 bits of v to p[0..3] in little-endian byte order */
static void
U32TO8(unsigned char *p, unsigned long v) {
	unsigned int k;

	for (k = 0; k < 4; k++) {
		p[k] = (unsigned char)(v & 0xff);
		v >>= 8;
	}
}
void | |||
poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { | |||
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; | |||
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ | |||
st->r[0] = (U8TO32(&key[ 0]) ) & 0x3ffffff; | |||
st->r[1] = (U8TO32(&key[ 3]) >> 2) & 0x3ffff03; | |||
st->r[2] = (U8TO32(&key[ 6]) >> 4) & 0x3ffc0ff; | |||
st->r[3] = (U8TO32(&key[ 9]) >> 6) & 0x3f03fff; | |||
st->r[4] = (U8TO32(&key[12]) >> 8) & 0x00fffff; | |||
/* h = 0 */ | |||
st->h[0] = 0; | |||
st->h[1] = 0; | |||
st->h[2] = 0; | |||
st->h[3] = 0; | |||
st->h[4] = 0; | |||
/* save pad for later */ | |||
st->pad[0] = U8TO32(&key[16]); | |||
st->pad[1] = U8TO32(&key[20]); | |||
st->pad[2] = U8TO32(&key[24]); | |||
st->pad[3] = U8TO32(&key[28]); | |||
st->leftover = 0; | |||
st->final = 0; | |||
} | |||
static void | |||
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { | |||
const unsigned long hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */ | |||
unsigned long r0,r1,r2,r3,r4; | |||
unsigned long s1,s2,s3,s4; | |||
unsigned long h0,h1,h2,h3,h4; | |||
unsigned long long d0,d1,d2,d3,d4; | |||
unsigned long c; | |||
r0 = st->r[0]; | |||
r1 = st->r[1]; | |||
r2 = st->r[2]; | |||
r3 = st->r[3]; | |||
r4 = st->r[4]; | |||
s1 = r1 * 5; | |||
s2 = r2 * 5; | |||
s3 = r3 * 5; | |||
s4 = r4 * 5; | |||
h0 = st->h[0]; | |||
h1 = st->h[1]; | |||
h2 = st->h[2]; | |||
h3 = st->h[3]; | |||
h4 = st->h[4]; | |||
while (bytes >= poly1305_block_size) { | |||
/* h += m[i] */ | |||
h0 += (U8TO32(m+ 0) ) & 0x3ffffff; | |||
h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff; | |||
h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff; | |||
h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff; | |||
h4 += (U8TO32(m+12) >> 8) | hibit; | |||
/* h *= r */ | |||
d0 = ((unsigned long long)h0 * r0) + ((unsigned long long)h1 * s4) + ((unsigned long long)h2 * s3) + ((unsigned long long)h3 * s2) + ((unsigned long long)h4 * s1); | |||
d1 = ((unsigned long long)h0 * r1) + ((unsigned long long)h1 * r0) + ((unsigned long long)h2 * s4) + ((unsigned long long)h3 * s3) + ((unsigned long long)h4 * s2); | |||
d2 = ((unsigned long long)h0 * r2) + ((unsigned long long)h1 * r1) + ((unsigned long long)h2 * r0) + ((unsigned long long)h3 * s4) + ((unsigned long long)h4 * s3); | |||
d3 = ((unsigned long long)h0 * r3) + ((unsigned long long)h1 * r2) + ((unsigned long long)h2 * r1) + ((unsigned long long)h3 * r0) + ((unsigned long long)h4 * s4); | |||
d4 = ((unsigned long long)h0 * r4) + ((unsigned long long)h1 * r3) + ((unsigned long long)h2 * r2) + ((unsigned long long)h3 * r1) + ((unsigned long long)h4 * r0); | |||
/* (partial) h %= p */ | |||
c = (unsigned long)(d0 >> 26); h0 = (unsigned long)d0 & 0x3ffffff; | |||
d1 += c; c = (unsigned long)(d1 >> 26); h1 = (unsigned long)d1 & 0x3ffffff; | |||
d2 += c; c = (unsigned long)(d2 >> 26); h2 = (unsigned long)d2 & 0x3ffffff; | |||
d3 += c; c = (unsigned long)(d3 >> 26); h3 = (unsigned long)d3 & 0x3ffffff; | |||
d4 += c; c = (unsigned long)(d4 >> 26); h4 = (unsigned long)d4 & 0x3ffffff; | |||
h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff; | |||
h1 += c; | |||
m += poly1305_block_size; | |||
bytes -= poly1305_block_size; | |||
} | |||
st->h[0] = h0; | |||
st->h[1] = h1; | |||
st->h[2] = h2; | |||
st->h[3] = h3; | |||
st->h[4] = h4; | |||
} | |||
POLY1305_NOINLINE void | |||
poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { | |||
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; | |||
unsigned long h0,h1,h2,h3,h4,c; | |||
unsigned long g0,g1,g2,g3,g4; | |||
unsigned long long f; | |||
unsigned long mask; | |||
/* process the remaining block */ | |||
if (st->leftover) { | |||
size_t i = st->leftover; | |||
st->buffer[i++] = 1; | |||
for (; i < poly1305_block_size; i++) | |||
st->buffer[i] = 0; | |||
st->final = 1; | |||
poly1305_blocks(st, st->buffer, poly1305_block_size); | |||
} | |||
/* fully carry h */ | |||
h0 = st->h[0]; | |||
h1 = st->h[1]; | |||
h2 = st->h[2]; | |||
h3 = st->h[3]; | |||
h4 = st->h[4]; | |||
c = h1 >> 26; h1 = h1 & 0x3ffffff; | |||
h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff; | |||
h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff; | |||
h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff; | |||
h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff; | |||
h1 += c; | |||
/* compute h + -p */ | |||
g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff; | |||
g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff; | |||
g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff; | |||
g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff; | |||
g4 = h4 + c - (1 << 26); | |||
/* select h if h < p, or h + -p if h >= p */ | |||
mask = (g4 >> ((sizeof(unsigned long) * 8) - 1)) - 1; | |||
g0 &= mask; | |||
g1 &= mask; | |||
g2 &= mask; | |||
g3 &= mask; | |||
g4 &= mask; | |||
mask = ~mask; | |||
h0 = (h0 & mask) | g0; | |||
h1 = (h1 & mask) | g1; | |||
h2 = (h2 & mask) | g2; | |||
h3 = (h3 & mask) | g3; | |||
h4 = (h4 & mask) | g4; | |||
/* h = h % (2^128) */ | |||
h0 = ((h0 ) | (h1 << 26)) & 0xffffffff; | |||
h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; | |||
h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; | |||
h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; | |||
/* mac = (h + pad) % (2^128) */ | |||
f = (unsigned long long)h0 + st->pad[0] ; h0 = (unsigned long)f; | |||
f = (unsigned long long)h1 + st->pad[1] + (f >> 32); h1 = (unsigned long)f; | |||
f = (unsigned long long)h2 + st->pad[2] + (f >> 32); h2 = (unsigned long)f; | |||
f = (unsigned long long)h3 + st->pad[3] + (f >> 32); h3 = (unsigned long)f; | |||
U32TO8(mac + 0, h0); | |||
U32TO8(mac + 4, h1); | |||
U32TO8(mac + 8, h2); | |||
U32TO8(mac + 12, h3); | |||
/* zero out the state */ | |||
st->h[0] = 0; | |||
st->h[1] = 0; | |||
st->h[2] = 0; | |||
st->h[3] = 0; | |||
st->h[4] = 0; | |||
st->r[0] = 0; | |||
st->r[1] = 0; | |||
st->r[2] = 0; | |||
st->r[3] = 0; | |||
st->r[4] = 0; | |||
st->pad[0] = 0; | |||
st->pad[1] = 0; | |||
st->pad[2] = 0; | |||
st->pad[3] = 0; | |||
} | |||
@@ -1,224 +0,0 @@ | |||
/* | |||
poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition | |||
*/ | |||
/*
 * 128 bit arithmetic helpers used by the 64 bit implementation.
 *   MUL(out, x, y)  out = x * y            (64 x 64 -> 128)
 *   ADD(out, in)    out += in              (128 += 128)
 *   ADDLO(out, in)  out += in              (128 += 64)
 *   SHR(in, shift)  low 64 bits of (in >> shift)
 *   LO(in)          low 64 bits of in
 * Macro arguments are parenthesized so that expression arguments expand
 * correctly, and compilers without a known 128 bit facility get an explicit
 * diagnostic instead of a cascade of undefined-identifier errors.
 */
#if defined(_MSC_VER)
	#include <intrin.h>

	typedef struct uint128_t {
		unsigned long long lo;
		unsigned long long hi;
	} uint128_t;

	#define MUL(out, x, y) (out).lo = _umul128((x), (y), &(out).hi)
	#define ADD(out, in) { unsigned long long t = (out).lo; (out).lo += (in).lo; (out).hi += ((out).lo < t) + (in).hi; }
	#define ADDLO(out, in) { unsigned long long t = (out).lo; (out).lo += (in); (out).hi += ((out).lo < t); }
	#define SHR(in, shift) (__shiftright128((in).lo, (in).hi, (shift)))
	#define LO(in) ((in).lo)

	#define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
	#if defined(__SIZEOF_INT128__)
		typedef unsigned __int128 uint128_t;
	#else
		typedef unsigned uint128_t __attribute__((mode(TI)));
	#endif

	#define MUL(out, x, y) (out) = ((uint128_t)(x) * (y))
	#define ADD(out, in) (out) += (in)
	#define ADDLO(out, in) (out) += (in)
	#define SHR(in, shift) (unsigned long long)((in) >> (shift))
	#define LO(in) (unsigned long long)(in)

	#define POLY1305_NOINLINE __attribute__((noinline))
#else
	#error "poly1305-donna-64: no 128 bit multiplication support for this compiler"
#endif
#define poly1305_block_size 16 | |||
/*
 * Internal state of the 64 bit implementation. The public functions obtain
 * it by casting the caller's poly1305_context pointer.
 */
/* 17 + sizeof(size_t) + 8*sizeof(unsigned long long) */ | |||
typedef struct poly1305_state_internal_t { | |||
/* clamped key half as limbs of 44, 44 and 42 bits (filled by poly1305_init) */
unsigned long long r[3]; | |||
/* accumulator, same limb layout as r */
unsigned long long h[3]; | |||
/* key bytes 16..31 as two 64 bit words, added to h in poly1305_finish */
unsigned long long pad[2]; | |||
/* number of valid bytes currently held in buffer */
size_t leftover; | |||
/* bytes of a block that is not complete yet */
unsigned char buffer[poly1305_block_size]; | |||
/* set to 1 by poly1305_finish before the padded last block; disables hibit */
unsigned char final; | |||
} poly1305_state_internal_t; | |||
/* read the eight bytes at p as one little-endian 64 bit unsigned integer */
static unsigned long long
U8TO64(const unsigned char *p) {
	unsigned long long value = 0;
	int k;

	/* start from the most significant byte and shift the rest in below it */
	for (k = 7; k >= 0; k--)
		value = (value << 8) | (unsigned long long)(p[k] & 0xff);
	return value;
}
/* write v to p[0..7] in little-endian byte order */
static void
U64TO8(unsigned char *p, unsigned long long v) {
	unsigned int k;

	for (k = 0; k < 8; k++) {
		p[k] = (unsigned char)(v & 0xff);
		v >>= 8;
	}
}
void | |||
poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { | |||
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; | |||
unsigned long long t0,t1; | |||
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ | |||
t0 = U8TO64(&key[0]); | |||
t1 = U8TO64(&key[8]); | |||
st->r[0] = ( t0 ) & 0xffc0fffffff; | |||
st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; | |||
st->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f; | |||
/* h = 0 */ | |||
st->h[0] = 0; | |||
st->h[1] = 0; | |||
st->h[2] = 0; | |||
/* save pad for later */ | |||
st->pad[0] = U8TO64(&key[16]); | |||
st->pad[1] = U8TO64(&key[24]); | |||
st->leftover = 0; | |||
st->final = 0; | |||
} | |||
static void | |||
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { | |||
const unsigned long long hibit = (st->final) ? 0 : ((unsigned long long)1 << 40); /* 1 << 128 */ | |||
unsigned long long r0,r1,r2; | |||
unsigned long long s1,s2; | |||
unsigned long long h0,h1,h2; | |||
unsigned long long c; | |||
uint128_t d0,d1,d2,d; | |||
r0 = st->r[0]; | |||
r1 = st->r[1]; | |||
r2 = st->r[2]; | |||
h0 = st->h[0]; | |||
h1 = st->h[1]; | |||
h2 = st->h[2]; | |||
s1 = r1 * (5 << 2); | |||
s2 = r2 * (5 << 2); | |||
while (bytes >= poly1305_block_size) { | |||
unsigned long long t0,t1; | |||
/* h += m[i] */ | |||
t0 = U8TO64(&m[0]); | |||
t1 = U8TO64(&m[8]); | |||
h0 += (( t0 ) & 0xfffffffffff); | |||
h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff); | |||
h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit; | |||
/* h *= r */ | |||
MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d); | |||
MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d); | |||
MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d); | |||
/* (partial) h %= p */ | |||
c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff; | |||
ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff; | |||
ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff; | |||
h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff; | |||
h1 += c; | |||
m += poly1305_block_size; | |||
bytes -= poly1305_block_size; | |||
} | |||
st->h[0] = h0; | |||
st->h[1] = h1; | |||
st->h[2] = h2; | |||
} | |||
POLY1305_NOINLINE void | |||
poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { | |||
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; | |||
unsigned long long h0,h1,h2,c; | |||
unsigned long long g0,g1,g2; | |||
unsigned long long t0,t1; | |||
/* process the remaining block */ | |||
if (st->leftover) { | |||
size_t i = st->leftover; | |||
st->buffer[i] = 1; | |||
for (i = i + 1; i < poly1305_block_size; i++) | |||
st->buffer[i] = 0; | |||
st->final = 1; | |||
poly1305_blocks(st, st->buffer, poly1305_block_size); | |||
} | |||
/* fully carry h */ | |||
h0 = st->h[0]; | |||
h1 = st->h[1]; | |||
h2 = st->h[2]; | |||
c = (h1 >> 44); h1 &= 0xfffffffffff; | |||
h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; | |||
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; | |||
h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff; | |||
h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; | |||
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; | |||
h1 += c; | |||
/* compute h + -p */ | |||
g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff; | |||
g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff; | |||
g2 = h2 + c - ((unsigned long long)1 << 42); | |||
/* select h if h < p, or h + -p if h >= p */ | |||
c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1; | |||
g0 &= c; | |||
g1 &= c; | |||
g2 &= c; | |||
c = ~c; | |||
h0 = (h0 & c) | g0; | |||
h1 = (h1 & c) | g1; | |||
h2 = (h2 & c) | g2; | |||
/* h = (h + pad) */ | |||
t0 = st->pad[0]; | |||
t1 = st->pad[1]; | |||
h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff; | |||
h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff; | |||
h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff; | |||
/* mac = h % (2^128) */ | |||
h0 = ((h0 ) | (h1 << 44)); | |||
h1 = ((h1 >> 20) | (h2 << 24)); | |||
U64TO8(&mac[0], h0); | |||
U64TO8(&mac[8], h1); | |||
/* zero out the state */ | |||
st->h[0] = 0; | |||
st->h[1] = 0; | |||
st->h[2] = 0; | |||
st->r[0] = 0; | |||
st->r[1] = 0; | |||
st->r[2] = 0; | |||
st->pad[0] = 0; | |||
st->pad[1] = 0; | |||
} | |||
@@ -1,186 +0,0 @@ | |||
/* | |||
poly1305 implementation using 8 bit * 8 bit = 16 bit multiplication and 32 bit addition | |||
based on the public domain reference version in supercop by djb | |||
*/ | |||
#if defined(_MSC_VER) | |||
#define POLY1305_NOINLINE __declspec(noinline) | |||
#elif defined(__GNUC__) | |||
#define POLY1305_NOINLINE __attribute__((noinline)) | |||
#else | |||
#define POLY1305_NOINLINE | |||
#endif | |||
#define poly1305_block_size 16 | |||
/*
 * Internal state of the 8 bit implementation. The public functions obtain
 * it by casting the caller's poly1305_context pointer.
 */
/* 17 + sizeof(size_t) + 51*sizeof(unsigned char) */ | |||
typedef struct poly1305_state_internal_t { | |||
/* bytes of a block that is not complete yet */
unsigned char buffer[poly1305_block_size]; | |||
/* number of valid bytes currently held in buffer */
size_t leftover; | |||
/* accumulator, seventeen byte limbs, least significant first */
unsigned char h[17]; | |||
/* clamped key half (filled by poly1305_init), r[16] is always 0 */
unsigned char r[17]; | |||
/* key bytes 16..31, pad[16] is always 0; added to h in poly1305_finish */
unsigned char pad[17]; | |||
/* set to 1 by poly1305_finish before the padded last block; disables hibit */
unsigned char final; | |||
} poly1305_state_internal_t; | |||
void | |||
poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { | |||
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; | |||
size_t i; | |||
st->leftover = 0; | |||
/* h = 0 */ | |||
for (i = 0; i < 17; i++) | |||
st->h[i] = 0; | |||
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ | |||
st->r[ 0] = key[ 0] & 0xff; | |||
st->r[ 1] = key[ 1] & 0xff; | |||
st->r[ 2] = key[ 2] & 0xff; | |||
st->r[ 3] = key[ 3] & 0x0f; | |||
st->r[ 4] = key[ 4] & 0xfc; | |||
st->r[ 5] = key[ 5] & 0xff; | |||
st->r[ 6] = key[ 6] & 0xff; | |||
st->r[ 7] = key[ 7] & 0x0f; | |||
st->r[ 8] = key[ 8] & 0xfc; | |||
st->r[ 9] = key[ 9] & 0xff; | |||
st->r[10] = key[10] & 0xff; | |||
st->r[11] = key[11] & 0x0f; | |||
st->r[12] = key[12] & 0xfc; | |||
st->r[13] = key[13] & 0xff; | |||
st->r[14] = key[14] & 0xff; | |||
st->r[15] = key[15] & 0x0f; | |||
st->r[16] = 0; | |||
/* save pad for later */ | |||
for (i = 0; i < 16; i++) | |||
st->pad[i] = key[i + 16]; | |||
st->pad[16] = 0; | |||
st->final = 0; | |||
} | |||
/* h += c over seventeen little-endian byte limbs; a carry out of h[16] is dropped */
static void
poly1305_add(unsigned char h[17], const unsigned char c[17]) {
	unsigned short carry = 0;
	unsigned int k = 0;

	while (k < 17) {
		carry += (unsigned short)h[k] + (unsigned short)c[k];
		h[k] = (unsigned char)carry & 0xff;
		carry >>= 8;
		k++;
	}
}
/*
 * Propagate carries through the wide limbs hr into byte limbs h, keeping two
 * bits in h[16] and folding everything above bit 130 back in times 5.
 */
static void
poly1305_squeeze(unsigned char h[17], unsigned long hr[17]) {
	unsigned long carry = 0;
	unsigned int k;

	/* first pass: byte-wise carry over the low sixteen limbs */
	for (k = 0; k < 16; k++) {
		carry += hr[k];
		h[k] = (unsigned char)carry & 0xff;
		carry >>= 8;
	}

	/* top limb keeps bits 128 and 129 only */
	carry += hr[16];
	h[16] = (unsigned char)carry & 0x03;
	carry >>= 2;
	carry += (carry << 2); /* carry *= 5; */

	/* second pass: add the folded overflow from the bottom up */
	for (k = 0; k < 16; k++) {
		carry += h[k];
		h[k] = (unsigned char)carry & 0xff;
		carry >>= 8;
	}
	h[16] += (unsigned char)carry;
}
/* bring h into the range [0, p) by conditionally subtracting p = 2^130 - 5 */
static void
poly1305_freeze(unsigned char h[17]) {
	/* -p in seventeen byte limbs */
	static const unsigned char minusp[17] = {
		0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
		0xfc
	};
	unsigned char saved[17], restore;
	unsigned int k;

	/* compute h + -p */
	for (k = 0; k < 17; k++)
		saved[k] = h[k];
	poly1305_add(h, minusp);

	/* select h if h < p, or h + -p if h >= p */
	restore = -(h[16] >> 7);
	for (k = 0; k < 17; k++)
		h[k] ^= restore & (saved[k] ^ h[k]);
}
static void | |||
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { | |||
const unsigned char hibit = st->final ^ 1; /* 1 << 128 */ | |||
while (bytes >= poly1305_block_size) { | |||
unsigned long hr[17], u; | |||
unsigned char c[17]; | |||
unsigned int i, j; | |||
/* h += m */ | |||
for (i = 0; i < 16; i++) | |||
c[i] = m[i]; | |||
c[16] = hibit; | |||
poly1305_add(st->h, c); | |||
/* h *= r */ | |||
for (i = 0; i < 17; i++) { | |||
u = 0; | |||
for (j = 0; j <= i ; j++) { | |||
u += (unsigned short)st->h[j] * st->r[i - j]; | |||
} | |||
for (j = i + 1; j < 17; j++) { | |||
unsigned long v = (unsigned short)st->h[j] * st->r[i + 17 - j]; | |||
v = ((v << 8) + (v << 6)); /* v *= (5 << 6); */ | |||
u += v; | |||
} | |||
hr[i] = u; | |||
} | |||
/* (partial) h %= p */ | |||
poly1305_squeeze(st->h, hr); | |||
m += poly1305_block_size; | |||
bytes -= poly1305_block_size; | |||
} | |||
} | |||
POLY1305_NOINLINE void | |||
poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { | |||
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; | |||
size_t i; | |||
/* process the remaining block */ | |||
if (st->leftover) { | |||
size_t i = st->leftover; | |||
st->buffer[i++] = 1; | |||
for (; i < poly1305_block_size; i++) | |||
st->buffer[i] = 0; | |||
st->final = 1; | |||
poly1305_blocks(st, st->buffer, poly1305_block_size); | |||
} | |||
/* fully reduce h */ | |||
poly1305_freeze(st->h); | |||
/* h = (h + pad) % (1 << 128) */ | |||
poly1305_add(st->h, st->pad); | |||
for (i = 0; i < 16; i++) | |||
mac[i] = st->h[i]; | |||
/* zero out the state */ | |||
for (i = 0; i < 17; i++) | |||
st->h[i] = 0; | |||
for (i = 0; i < 17; i++) | |||
st->r[i] = 0; | |||
for (i = 0; i < 17; i++) | |||
st->pad[i] = 0; | |||
} |
@@ -1,201 +0,0 @@ | |||
#include "poly1305-donna.h" | |||
/*
 * Pick the limb-size specific implementation. An explicit POLY1305_*BIT
 * define wins; otherwise the 64 bit version is used when the compiler is
 * known to provide a 128 bit multiply, and the 32 bit version elsewhere.
 *
 * The HAS_* helpers are plain 0/1 macros: letting a macro expand to
 * `defined(...)` inside #if is undefined behaviour in ISO C.
 */
#if defined(POLY1305_8BIT)
	#include "poly1305-donna-8.h"
#elif defined(POLY1305_16BIT)
	#include "poly1305-donna-16.h"
#elif defined(POLY1305_32BIT)
	#include "poly1305-donna-32.h"
#elif defined(POLY1305_64BIT)
	#include "poly1305-donna-64.h"
#else
	/* auto detect between 32bit / 64bit */
	#if defined(__SIZEOF_INT128__) && defined(__LP64__)
		#define HAS_SIZEOF_INT128_64BIT 1
	#else
		#define HAS_SIZEOF_INT128_64BIT 0
	#endif

	#if defined(_MSC_VER) && defined(_M_X64)
		#define HAS_MSVC_64BIT 1
	#else
		#define HAS_MSVC_64BIT 0
	#endif

	#if defined(__GNUC__) && defined(__LP64__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)))
		#define HAS_GCC_4_4_64BIT 1
	#else
		#define HAS_GCC_4_4_64BIT 0
	#endif

	#if (HAS_SIZEOF_INT128_64BIT || HAS_MSVC_64BIT || HAS_GCC_4_4_64BIT)
		#include "poly1305-donna-64.h"
	#else
		#include "poly1305-donna-32.h"
	#endif
#endif
void | |||
poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes) { | |||
poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; | |||
size_t i; | |||
/* handle leftover */ | |||
if (st->leftover) { | |||
size_t want = (poly1305_block_size - st->leftover); | |||
if (want > bytes) | |||
want = bytes; | |||
for (i = 0; i < want; i++) | |||
st->buffer[st->leftover + i] = m[i]; | |||
bytes -= want; | |||
m += want; | |||
st->leftover += want; | |||
if (st->leftover < poly1305_block_size) | |||
return; | |||
poly1305_blocks(st, st->buffer, poly1305_block_size); | |||
st->leftover = 0; | |||
} | |||
/* process full blocks */ | |||
if (bytes >= poly1305_block_size) { | |||
size_t want = (bytes & ~(poly1305_block_size - 1)); | |||
poly1305_blocks(st, m, want); | |||
m += want; | |||
bytes -= want; | |||
} | |||
/* store leftover */ | |||
if (bytes) { | |||
for (i = 0; i < bytes; i++) | |||
st->buffer[st->leftover + i] = m[i]; | |||
st->leftover += bytes; | |||
} | |||
} | |||
void | |||
poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]) { | |||
poly1305_context ctx; | |||
poly1305_init(&ctx, key); | |||
poly1305_update(&ctx, m, bytes); | |||
poly1305_finish(&ctx, mac); | |||
} | |||
/*
 * Compare two 16 byte tags without branching on their contents.
 * Returns 1 when they are equal and 0 otherwise.
 */
int
poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]) {
	unsigned int diff = 0;
	size_t k = 16;

	/* accumulate every differing bit; all 16 bytes are always visited */
	while (k-- > 0)
		diff |= (mac1[k] ^ mac2[k]);

	/* diff == 0 wraps to all ones, whose top bit is set; anything else does not */
	diff = (diff - 1) >> ((sizeof(unsigned int) * 8) - 1);
	return (diff & 1);
}
/* test a few basic operations */ | |||
int | |||
poly1305_power_on_self_test(void) { | |||
/* example from nacl */ | |||
static const unsigned char nacl_key[32] = { | |||
0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91, | |||
0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25, | |||
0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65, | |||
0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80, | |||
}; | |||
static const unsigned char nacl_msg[131] = { | |||
0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73, | |||
0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce, | |||
0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4, | |||
0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a, | |||
0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b, | |||
0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72, | |||
0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2, | |||
0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38, | |||
0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a, | |||
0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae, | |||
0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea, | |||
0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda, | |||
0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde, | |||
0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3, | |||
0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6, | |||
0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74, | |||
0xe3,0x55,0xa5 | |||
}; | |||
static const unsigned char nacl_mac[16] = { | |||
0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5, | |||
0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9 | |||
}; | |||
/* generates a final value of (2^130 - 2) == 3 */ | |||
static const unsigned char wrap_key[32] = { | |||
0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00, | |||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, | |||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, | |||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, | |||
}; | |||
static const unsigned char wrap_msg[16] = { | |||
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, | |||
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff | |||
}; | |||
static const unsigned char wrap_mac[16] = { | |||
0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00, | |||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, | |||
}; | |||
/* | |||
mac of the macs of messages of length 0 to 256, where the key and messages | |||
have all their values set to the length | |||
*/ | |||
static const unsigned char total_key[32] = { | |||
0x01,0x02,0x03,0x04,0x05,0x06,0x07, | |||
0xff,0xfe,0xfd,0xfc,0xfb,0xfa,0xf9, | |||
0xff,0xff,0xff,0xff,0xff,0xff,0xff, | |||
0xff,0xff,0xff,0xff,0xff,0xff,0xff | |||
}; | |||
static const unsigned char total_mac[16] = { | |||
0x64,0xaf,0xe2,0xe8,0xd6,0xad,0x7b,0xbd, | |||
0xd2,0x87,0xf9,0x7c,0x44,0x62,0x3d,0x39 | |||
}; | |||
poly1305_context ctx; | |||
poly1305_context total_ctx; | |||
unsigned char all_key[32]; | |||
unsigned char all_msg[256]; | |||
unsigned char mac[16]; | |||
size_t i, j; | |||
int result = 1; | |||
for (i = 0; i < sizeof(mac); i++) | |||
mac[i] = 0; | |||
poly1305_auth(mac, nacl_msg, sizeof(nacl_msg), nacl_key); | |||
result &= poly1305_verify(nacl_mac, mac); | |||
for (i = 0; i < sizeof(mac); i++) | |||
mac[i] = 0; | |||
poly1305_init(&ctx, nacl_key); | |||
poly1305_update(&ctx, nacl_msg + 0, 32); | |||
poly1305_update(&ctx, nacl_msg + 32, 64); | |||
poly1305_update(&ctx, nacl_msg + 96, 16); | |||
poly1305_update(&ctx, nacl_msg + 112, 8); | |||
poly1305_update(&ctx, nacl_msg + 120, 4); | |||
poly1305_update(&ctx, nacl_msg + 124, 2); | |||
poly1305_update(&ctx, nacl_msg + 126, 1); | |||
poly1305_update(&ctx, nacl_msg + 127, 1); | |||
poly1305_update(&ctx, nacl_msg + 128, 1); | |||
poly1305_update(&ctx, nacl_msg + 129, 1); | |||
poly1305_update(&ctx, nacl_msg + 130, 1); | |||
poly1305_finish(&ctx, mac); | |||
result &= poly1305_verify(nacl_mac, mac); | |||
for (i = 0; i < sizeof(mac); i++) | |||
mac[i] = 0; | |||
poly1305_auth(mac, wrap_msg, sizeof(wrap_msg), wrap_key); | |||
result &= poly1305_verify(wrap_mac, mac); | |||
poly1305_init(&total_ctx, total_key); | |||
for (i = 0; i < 256; i++) { | |||
/* set key and message to 'i,i,i..' */ | |||
for (j = 0; j < sizeof(all_key); j++) | |||
all_key[j] = i; | |||
for (j = 0; j < i; j++) | |||
all_msg[j] = i; | |||
poly1305_auth(mac, all_msg, i, all_key); | |||
poly1305_update(&total_ctx, mac, 16); | |||
} | |||
poly1305_finish(&total_ctx, mac); | |||
result &= poly1305_verify(total_mac, mac); | |||
return result; | |||
} |
@@ -1,20 +0,0 @@ | |||
/* Public interface of the poly1305-donna implementation (this hunk removes the file). */
#ifndef POLY1305_DONNA_H | |||
#define POLY1305_DONNA_H | |||
#include <stddef.h> | |||
/*
 * Caller-allocated streaming context with an opaque 136 byte payload.
 * NOTE(review): `aligner` presumably exists only to give the opaque bytes
 * size_t alignment -- confirm against the implementation.
 */
typedef struct poly1305_context { | |||
size_t aligner; | |||
unsigned char opaque[136]; | |||
} poly1305_context; | |||
/* Streaming interface: init with a 32 byte key, feed data, then write the 16 byte mac. */
void poly1305_init(poly1305_context *ctx, const unsigned char key[32]); | |||
void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes); | |||
void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]); | |||
/* One-shot interface: authenticate `bytes` bytes of `m` under `key` into `mac`. */
void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]); | |||
/* Compares two 16 byte macs; the self test treats a non-zero result as a match. */
int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]); | |||
/* Runs the built-in test vectors; the result starts at 1 and is and-ed with each check. */
int poly1305_power_on_self_test(void); | |||
#endif /* POLY1305_DONNA_H */ | |||
@@ -0,0 +1,222 @@ | |||
/* | |||
* Copyright (c) 2015, Vsevolod Stakhov | |||
* Copyright (c) 2015, Andrew Moon | |||
* All rights reserved. | |||
* | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are met: | |||
* * Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* * Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in the | |||
* documentation and/or other materials provided with the distribution. | |||
* | |||
* THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY | |||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |||
* DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY | |||
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*/ | |||
#include "config.h" | |||
#include "cryptobox.h" | |||
#include "poly1305.h" | |||
#include "platform_config.h" | |||
extern unsigned long cpu_config; | |||
/*
 * Private view of a caller-provided poly1305_state. The public functions
 * cast the opaque poly1305_state pointer to this layout.
 */
typedef struct poly1305_state_internal_t
{
	/* backend-private storage, sized for the largest state (AVX2) */
	unsigned char opaque[192];
	/* number of bytes currently held in `buffer` */
	size_t leftover;
	/* block size reported by the selected backend when the state was set up */
	size_t block_size;
	/* staging area for a partial block, sized for the largest block (AVX2) */
	unsigned char buffer[64];
} poly1305_state_internal;
typedef struct poly1305_impl_t | |||
{ | |||
unsigned long cpu_flags; | |||
const char *desc; | |||
size_t (*block_size)(void); | |||
void (*init_ext)(void *state, const poly1305_key *key, size_t bytes_hint); | |||
void (*blocks)(void *state, const unsigned char *in, size_t inlen); | |||
void (*finish_ext)(void *state, const unsigned char *in, size_t remaining, | |||
unsigned char *mac); | |||
void (*auth)(unsigned char *mac, const unsigned char *in, size_t inlen, | |||
const poly1305_key *key); | |||
} poly1305_impl_t; | |||
/*
 * POLY1305_DECLARE(ext) emits the prototypes of the five entry points a
 * backend named `ext` provides: block_size, init_ext, blocks, finish_ext
 * and auth, each suffixed with _<ext>.
 */
#define POLY1305_DECLARE(ext) \ | |||
size_t poly1305_block_size_##ext(void); \ | |||
void poly1305_init_ext_##ext(void *state, const poly1305_key *key, size_t bytes_hint); \ | |||
void poly1305_blocks_##ext(void *state, const unsigned char *in, size_t inlen); \ | |||
void poly1305_finish_ext_##ext(void *state, const unsigned char *in, size_t remaining, unsigned char *mac); \ | |||
void poly1305_auth_##ext(unsigned char *mac, const unsigned char *m, size_t inlen, const poly1305_key *key); | |||
/*
 * POLY1305_IMPL(cpuflags, desc, ext) expands to a poly1305_impl_t
 * initializer binding backend `ext` to its CPU flags and description.
 */
#define POLY1305_IMPL(cpuflags, desc, ext) \ | |||
{(cpuflags), desc, poly1305_block_size_##ext, poly1305_init_ext_##ext, poly1305_blocks_##ext, poly1305_finish_ext_##ext, poly1305_auth_##ext} | |||
/* Each optimized backend is declared only when its HAVE_* macro is defined. */
#if defined(HAVE_AVX2) | |||
POLY1305_DECLARE(avx2) | |||
#define POLY1305_AVX2 POLY1305_IMPL(CPUID_AVX2, "avx2", avx2) | |||
#endif | |||
#if defined(HAVE_AVX) | |||
POLY1305_DECLARE(avx) | |||
#define POLY1305_AVX POLY1305_IMPL(CPUID_AVX, "avx", avx) | |||
#endif | |||
#if defined(HAVE_SSE2) | |||
POLY1305_DECLARE(sse2) | |||
#define POLY1305_SSE2 POLY1305_IMPL(CPUID_SSE2, "sse2", sse2) | |||
#endif | |||
/* The reference backend is always declared and needs no CPU flags. */
POLY1305_DECLARE(ref) | |||
#define POLY1305_GENERIC POLY1305_IMPL(0, "generic", ref) | |||
/* list implemenations from most optimized to least, with generic as the last entry */ | |||
static const poly1305_impl_t poly1305_list[] = | |||
{ | |||
POLY1305_GENERIC, | |||
#if defined(POLY1305_AVX2) | |||
POLY1305_AVX2, | |||
#endif | |||
#if defined(POLY1305_AVX) | |||
POLY1305_AVX, | |||
#endif | |||
#if defined(POLY1305_SSE2) | |||
POLY1305_SSE2, | |||
#endif | |||
}; | |||
static const poly1305_impl_t *poly1305_opt = &poly1305_list[0]; | |||
; | |||
/* returns 1 when `p` sits on a sizeof(size_t) boundary, 0 otherwise */
static int poly1305_is_aligned(const void *p)
{
	const size_t low_bits_mask = sizeof(size_t) - 1;
	const size_t address = (size_t) p;

	return (address & low_bits_mask) == 0;
}
void poly1305_load(void) | |||
{ | |||
guint i; | |||
if (cpu_config != 0) { | |||
for (i = 0; i < G_N_ELEMENTS(poly1305_list); i++) { | |||
if (poly1305_list[i].cpu_flags & cpu_config) { | |||
poly1305_opt = &poly1305_list[i]; | |||
break; | |||
} | |||
} | |||
} | |||
} | |||
/* processes inlen bytes (full blocks only), handling input alignment */ | |||
static void poly1305_consume(poly1305_state_internal *state, | |||
const unsigned char *in, size_t inlen) | |||
{ | |||
int in_aligned; | |||
/* it's ok to call with 0 bytes */ | |||
if (!inlen) | |||
return; | |||
/* if everything is aligned, handle directly */ | |||
in_aligned = poly1305_is_aligned (in); | |||
if (in_aligned) { | |||
poly1305_opt->blocks (state->opaque, in, inlen); | |||
return; | |||
} | |||
/* copy the unaligned data to an aligned buffer and process in chunks */ | |||
while (inlen) { | |||
unsigned char buffer[1024]; | |||
const size_t bytes = (inlen > sizeof(buffer)) ? sizeof(buffer) : inlen; | |||
memcpy (buffer, in, bytes); | |||
poly1305_opt->blocks (state->opaque, buffer, bytes); | |||
in += bytes; | |||
inlen -= bytes; | |||
} | |||
} | |||
/*
 * Set up `S` for a new message under `key`, with no size hint
 * (equivalent to poly1305_init_ext with bytes_hint == 0).
 */
void poly1305_init(poly1305_state *S, const poly1305_key *key)
{
	poly1305_init_ext (S, key, 0);
}
/*
 * Set up `S` for a new message under `key`. `bytes_hint` is forwarded to
 * the backend's init_ext as the expected message length. The buffered byte
 * count is reset and the backend's block size is cached in the state.
 */
void poly1305_init_ext(poly1305_state *S, const poly1305_key *key,
		size_t bytes_hint)
{
	poly1305_state_internal *st = (poly1305_state_internal *) S;
	const poly1305_impl_t *impl = poly1305_opt;

	impl->init_ext (st->opaque, key, bytes_hint);
	st->leftover = 0;
	st->block_size = impl->block_size ();
}
/*
 * Absorb `inlen` bytes from `in`. Whole blocks go to the backend; any
 * trailing partial block is kept in the state's buffer until more data
 * arrives or poly1305_finish() is called.
 */
void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen)
{
	poly1305_state_internal *st = (poly1305_state_internal *) S;
	size_t take;

	/* first top up a partial block left over from an earlier call */
	if (st->leftover != 0) {
		take = st->block_size - st->leftover;
		if (inlen < take) {
			take = inlen;
		}
		memcpy (st->buffer + st->leftover, in, take);
		in += take;
		inlen -= take;
		st->leftover += take;

		/* still short of a full block: wait for more input */
		if (st->leftover < st->block_size) {
			return;
		}

		poly1305_opt->blocks (st->opaque, st->buffer, st->block_size);
		st->leftover = 0;
	}

	/* hand over as many whole blocks as the input contains */
	if (inlen >= st->block_size) {
		take = inlen & ~(st->block_size - 1);
		poly1305_consume (st, in, take);
		in += take;
		inlen -= take;
	}

	/* keep the tail for later */
	if (inlen != 0) {
		memcpy (st->buffer + st->leftover, in, inlen);
		st->leftover += inlen;
	}
}
/*
 * Finish the message: pass the buffered tail bytes to the backend's
 * finish_ext, which writes the tag to `mac`.
 */
void poly1305_finish(poly1305_state *S, unsigned char *mac)
{
	poly1305_state_internal *st = (poly1305_state_internal *) S;
	const unsigned char *tail = st->buffer;
	const size_t tail_len = st->leftover;

	poly1305_opt->finish_ext (st->opaque, tail, tail_len, mac);
}
void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, | |||
const poly1305_key *key) | |||
{ | |||
poly1305_opt->auth (mac, in, inlen, key); | |||
} | |||
/*
 * Compare two 16 byte tags without branching on their contents.
 * Returns 1 when they are identical and 0 otherwise.
 */
int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16])
{
	const unsigned int top_bit = (unsigned int) (sizeof(unsigned int) * 8) - 1;
	unsigned int acc = 0;
	size_t pos = 0;

	/* collect every differing bit */
	while (pos < 16) {
		acc |= (unsigned int) (mac1[pos] ^ mac2[pos]);
		pos++;
	}

	/* acc is at most 0xff, so acc - 1 has its top bit set only when acc == 0 */
	acc = (acc - 1) >> top_bit;

	return (int) (acc & 1);
}
@@ -0,0 +1,38 @@ | |||
/* Public poly1305 interface; the work is dispatched to a backend chosen by poly1305_load(). */
#ifndef POLY1305_H | |||
#define POLY1305_H | |||
#include <stddef.h> | |||
#if defined(__cplusplus) | |||
extern "C" | |||
{ | |||
#endif | |||
/*
 * Caller-allocated streaming state. The implementation casts it to its
 * private layout, so callers must treat the bytes as opaque.
 */
typedef struct poly1305_state | |||
{ | |||
unsigned char opaque[320]; | |||
} poly1305_state; | |||
/* 32 byte one-time key. */
typedef struct poly1305_key | |||
{ | |||
unsigned char b[32]; | |||
} poly1305_key; | |||
/* Start a new message under `key`; same as poly1305_init_ext with bytes_hint 0. */
void poly1305_init(poly1305_state *S, const poly1305_key *key); | |||
/* Start a new message under `key`; `bytes_hint` is forwarded to the backend. */
void poly1305_init_ext(poly1305_state *S, const poly1305_key *key, | |||
size_t bytes_hint); | |||
/* Absorb `inlen` bytes; partial blocks are buffered inside the state. */
void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen); | |||
/* Absorb any buffered bytes and write the tag to `mac`. */
void poly1305_finish(poly1305_state *S, unsigned char *mac); | |||
/* One-shot: write the tag of `in` (`inlen` bytes) under `key` to `mac`. */
void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, | |||
const poly1305_key *key); | |||
/* Returns 1 when the two 16 byte tags are equal, 0 otherwise. */
int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]); | |||
/* Pick the backend matching the detected CPU features; generic is used otherwise. */
void poly1305_load(void); | |||
#if defined(__cplusplus) | |||
} | |||
#endif | |||
#endif /* POLY1305_H */ | |||
@@ -0,0 +1,237 @@ | |||
/* | |||
poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition | |||
assumes the existence of uint32_t and uint64_t | |||
*/ | |||
#include "config.h" | |||
/* Bytes consumed per block by this backend. */
enum { | |||
POLY1305_BLOCK_SIZE = 16 | |||
}; | |||
/*
 * State of the 32 bit reference backend.
 *   r     - clamped key, five limbs masked to at most 26 bits at init
 *   h     - accumulator, five limbs
 *   pad   - key bytes 16..31 as four little-endian 32 bit words
 *   final - when non-zero, poly1305_blocks_ref does not add the 2^128 bit
 */
typedef struct poly1305_state_ref_t { | |||
uint32_t r[5]; | |||
uint32_t h[5]; | |||
uint32_t pad[4]; | |||
unsigned char final; | |||
} poly1305_state_ref_t; | |||
/* read four bytes at `p` as a little-endian 32 bit unsigned integer */
static uint32_t
U8TO32(const unsigned char *p) {
	uint32_t word = 0;
	int idx;

	/* start from the most significant byte and shift it into place */
	for (idx = 3; idx >= 0; idx--) {
		word = (word << 8) | (uint32_t)(p[idx] & 0xff);
	}

	return word;
}
/* write `v` to the four bytes at `p` in little-endian order */
static void
U32TO8(unsigned char *p, uint32_t v) {
	int idx;

	for (idx = 0; idx < 4; idx++) {
		p[idx] = (unsigned char)(v & 0xff);
		v >>= 8;
	}
}
static size_t | |||
poly1305_block_size_ref(void) { | |||
return POLY1305_BLOCK_SIZE; | |||
} | |||
static void | |||
poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) { | |||
poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; | |||
/* bytes_hint not used */ | |||
(void)bytes_hint; | |||
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ | |||
st->r[0] = (U8TO32(&key->b[ 0]) ) & 0x3ffffff; | |||
st->r[1] = (U8TO32(&key->b[ 3]) >> 2) & 0x3ffff03; | |||
st->r[2] = (U8TO32(&key->b[ 6]) >> 4) & 0x3ffc0ff; | |||
st->r[3] = (U8TO32(&key->b[ 9]) >> 6) & 0x3f03fff; | |||
st->r[4] = (U8TO32(&key->b[12]) >> 8) & 0x00fffff; | |||
/* h = 0 */ | |||
st->h[0] = 0; | |||
st->h[1] = 0; | |||
st->h[2] = 0; | |||
st->h[3] = 0; | |||
st->h[4] = 0; | |||
/* save pad for later */ | |||
st->pad[0] = U8TO32(&key->b[16]); | |||
st->pad[1] = U8TO32(&key->b[20]); | |||
st->pad[2] = U8TO32(&key->b[24]); | |||
st->pad[3] = U8TO32(&key->b[28]); | |||
st->final = 0; | |||
} | |||
static void | |||
poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) { | |||
poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; | |||
const uint32_t hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */ | |||
uint32_t r0,r1,r2,r3,r4; | |||
uint32_t s1,s2,s3,s4; | |||
uint32_t h0,h1,h2,h3,h4; | |||
uint64_t d0,d1,d2,d3,d4; | |||
uint32_t c; | |||
r0 = st->r[0]; | |||
r1 = st->r[1]; | |||
r2 = st->r[2]; | |||
r3 = st->r[3]; | |||
r4 = st->r[4]; | |||
s1 = r1 * 5; | |||
s2 = r2 * 5; | |||
s3 = r3 * 5; | |||
s4 = r4 * 5; | |||
h0 = st->h[0]; | |||
h1 = st->h[1]; | |||
h2 = st->h[2]; | |||
h3 = st->h[3]; | |||
h4 = st->h[4]; | |||
while (inlen >= POLY1305_BLOCK_SIZE) { | |||
/* h += m[i] */ | |||
h0 += (U8TO32(in+ 0) ) & 0x3ffffff; | |||
h1 += (U8TO32(in+ 3) >> 2) & 0x3ffffff; | |||
h2 += (U8TO32(in+ 6) >> 4) & 0x3ffffff; | |||
h3 += (U8TO32(in+ 9) >> 6) & 0x3ffffff; | |||
h4 += (U8TO32(in+12) >> 8) | hibit; | |||
/* h *= r */ | |||
d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + ((uint64_t)h4 * s1); | |||
d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + ((uint64_t)h4 * s2); | |||
d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + ((uint64_t)h4 * s3); | |||
d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + ((uint64_t)h4 * s4); | |||
d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + ((uint64_t)h4 * r0); | |||
/* (partial) h %= p */ | |||
c = (uint32_t)(d0 >> 26); h0 = (uint32_t)d0 & 0x3ffffff; | |||
d1 += c; c = (uint32_t)(d1 >> 26); h1 = (uint32_t)d1 & 0x3ffffff; | |||
d2 += c; c = (uint32_t)(d2 >> 26); h2 = (uint32_t)d2 & 0x3ffffff; | |||
d3 += c; c = (uint32_t)(d3 >> 26); h3 = (uint32_t)d3 & 0x3ffffff; | |||
d4 += c; c = (uint32_t)(d4 >> 26); h4 = (uint32_t)d4 & 0x3ffffff; | |||
h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff; | |||
h1 += c; | |||
in += POLY1305_BLOCK_SIZE; | |||
inlen -= POLY1305_BLOCK_SIZE; | |||
} | |||
st->h[0] = h0; | |||
st->h[1] = h1; | |||
st->h[2] = h2; | |||
st->h[3] = h3; | |||
st->h[4] = h4; | |||
} | |||
static void | |||
poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) { | |||
poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; | |||
uint32_t h0,h1,h2,h3,h4,c; | |||
uint32_t g0,g1,g2,g3,g4; | |||
uint64_t f; | |||
uint32_t mask; | |||
/* process the remaining block */ | |||
if (remaining) { | |||
unsigned char final[POLY1305_BLOCK_SIZE] = {0}; | |||
size_t i; | |||
for (i = 0; i < remaining; i++) | |||
final[i] = in[i]; | |||
final[remaining] = 1; | |||
st->final = 1; | |||
poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE); | |||
} | |||
/* fully carry h */ | |||
h0 = st->h[0]; | |||
h1 = st->h[1]; | |||
h2 = st->h[2]; | |||
h3 = st->h[3]; | |||
h4 = st->h[4]; | |||
c = h1 >> 26; h1 = h1 & 0x3ffffff; | |||
h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff; | |||
h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff; | |||
h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff; | |||
h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff; | |||
h1 += c; | |||
/* compute h + -p */ | |||
g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff; | |||
g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff; | |||
g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff; | |||
g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff; | |||
g4 = h4 + c - (1 << 26); | |||
/* select h if h < p, or h + -p if h >= p */ | |||
mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1; | |||
g0 &= mask; | |||
g1 &= mask; | |||
g2 &= mask; | |||
g3 &= mask; | |||
g4 &= mask; | |||
mask = ~mask; | |||
h0 = (h0 & mask) | g0; | |||
h1 = (h1 & mask) | g1; | |||
h2 = (h2 & mask) | g2; | |||
h3 = (h3 & mask) | g3; | |||
h4 = (h4 & mask) | g4; | |||
/* h = h % (2^128) */ | |||
h0 = ((h0 ) | (h1 << 26)) & 0xffffffff; | |||
h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; | |||
h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; | |||
h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; | |||
/* mac = (h + pad) % (2^128) */ | |||
f = (uint64_t)h0 + st->pad[0] ; h0 = (uint32_t)f; | |||
f = (uint64_t)h1 + st->pad[1] + (f >> 32); h1 = (uint32_t)f; | |||
f = (uint64_t)h2 + st->pad[2] + (f >> 32); h2 = (uint32_t)f; | |||
f = (uint64_t)h3 + st->pad[3] + (f >> 32); h3 = (uint32_t)f; | |||
U32TO8(mac + 0, h0); | |||
U32TO8(mac + 4, h1); | |||
U32TO8(mac + 8, h2); | |||
U32TO8(mac + 12, h3); | |||
/* zero out the state */ | |||
st->h[0] = 0; | |||
st->h[1] = 0; | |||
st->h[2] = 0; | |||
st->h[3] = 0; | |||
st->h[4] = 0; | |||
st->r[0] = 0; | |||
st->r[1] = 0; | |||
st->r[2] = 0; | |||
st->r[3] = 0; | |||
st->r[4] = 0; | |||
st->pad[0] = 0; | |||
st->pad[1] = 0; | |||
st->pad[2] = 0; | |||
st->pad[3] = 0; | |||
} | |||
static void | |||
poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) { | |||
poly1305_state_ref_t st; | |||
size_t blocks; | |||
poly1305_init_ext_ref(&st, key, inlen); | |||
blocks = (inlen & ~(POLY1305_BLOCK_SIZE - 1)); | |||
if (blocks) { | |||
poly1305_blocks_ref(&st, in, blocks); | |||
in += blocks; | |||
inlen -= blocks; | |||
} | |||
poly1305_finish_ext_ref(&st, in, inlen, mac); | |||
} | |||
@@ -0,0 +1,231 @@ | |||
/* | |||
poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition | |||
assumes the existence of uint64_t and uint128_t | |||
*/ | |||
#include "config.h" | |||
/* Bytes consumed per block by this backend. */
enum { | |||
POLY1305_BLOCK_SIZE = 16 | |||
}; | |||
/*
 * 128 bit unsigned type used for the limb products.
 * NOTE(review): the MSVC branch defines uint128_t as a two-word struct, but
 * poly1305_blocks_ref casts to uint128_t and applies *, + and >> to it,
 * which only works with the integer typedefs of the GCC branch -- confirm
 * whether MSVC builds are expected to compile this file.
 * NOTE(review): POLY1305_NOINLINE is not referenced in the code visible
 * here, and no uint128_t exists when neither compiler macro is defined.
 */
#if defined(_MSC_VER) | |||
#include <intrin.h> | |||
typedef struct uint128_t { | |||
unsigned long long lo; | |||
unsigned long long hi; | |||
} uint128_t; | |||
#define POLY1305_NOINLINE __declspec(noinline) | |||
#elif defined(__GNUC__) | |||
#if defined(__SIZEOF_INT128__) | |||
typedef unsigned __int128 uint128_t; | |||
#else | |||
typedef unsigned uint128_t __attribute__((mode(TI))); | |||
#endif | |||
#define POLY1305_NOINLINE __attribute__((noinline)) | |||
#endif | |||
/*
 * State of the 64 bit reference backend.
 *   r     - clamped key, three limbs masked to at most 44, 44 and 40 bits at init
 *   h     - accumulator, three limbs (44, 44 and 42 bits after reduction)
 *   pad   - key bytes 16..31 as two little-endian 64 bit words
 *   final - when non-zero, poly1305_blocks_ref does not add the 2^128 bit
 */
typedef struct poly1305_state_ref_t { | |||
uint64_t r[3]; | |||
uint64_t h[3]; | |||
uint64_t pad[2]; | |||
unsigned char final; | |||
} poly1305_state_ref_t; | |||
/* read eight bytes at `p` as a little-endian 64 bit unsigned integer */
static uint64_t
U8TO64(const unsigned char *p) {
	uint64_t word = 0;
	int idx;

	/* start from the most significant byte and shift it into place */
	for (idx = 7; idx >= 0; idx--) {
		word = (word << 8) | (uint64_t)p[idx];
	}

	return word;
}
/* write `v` to the eight bytes at `p` in little-endian order */
static void
U64TO8(unsigned char *p, uint64_t v) {
	int idx;

	for (idx = 0; idx < 8; idx++) {
		p[idx] = (unsigned char)(v & 0xff);
		v >>= 8;
	}
}
static size_t | |||
poly1305_block_size_ref(void) { | |||
return POLY1305_BLOCK_SIZE; | |||
} | |||
static void | |||
poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) { | |||
poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; | |||
uint64_t t0, t1; | |||
/* bytes_hint not used */ | |||
(void)bytes_hint; | |||
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ | |||
t0 = U8TO64(&key->b[0]); | |||
t1 = U8TO64(&key->b[8]); | |||
st->r[0] = ( t0 ) & 0xffc0fffffff; | |||
st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; | |||
st->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f; | |||
/* h = 0 */ | |||
st->h[0] = 0; | |||
st->h[1] = 0; | |||
st->h[2] = 0; | |||
/* save pad for later */ | |||
st->pad[0] = U8TO64(&key->b[16]); | |||
st->pad[1] = U8TO64(&key->b[24]); | |||
st->final = 0; | |||
} | |||
static void | |||
poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) { | |||
poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; | |||
const uint64_t hibit = (st->final) ? 0 : ((uint64_t)1 << 40); /* 1 << 128 */ | |||
uint64_t r0,r1,r2; | |||
uint64_t s1,s2; | |||
uint64_t h0,h1,h2; | |||
uint64_t c; | |||
uint128_t d0,d1,d2; | |||
r0 = st->r[0]; | |||
r1 = st->r[1]; | |||
r2 = st->r[2]; | |||
s1 = r1 * (5 << 2); | |||
s2 = r2 * (5 << 2); | |||
h0 = st->h[0]; | |||
h1 = st->h[1]; | |||
h2 = st->h[2]; | |||
while (inlen >= POLY1305_BLOCK_SIZE) { | |||
uint64_t t0, t1; | |||
/* h += in[i] */ | |||
t0 = U8TO64(in + 0); | |||
t1 = U8TO64(in + 8); | |||
h0 += (( t0 ) & 0xfffffffffff); | |||
h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff); | |||
h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit; | |||
/* h *= r */ | |||
d0 = ((uint128_t)h0 * r0) + ((uint128_t)h1 * s2) + ((uint128_t)h2 * s1); | |||
d1 = ((uint128_t)h0 * r1) + ((uint128_t)h1 * r0) + ((uint128_t)h2 * s2); | |||
d2 = ((uint128_t)h0 * r2) + ((uint128_t)h1 * r1) + ((uint128_t)h2 * r0); | |||
/* (partial) h %= p */ | |||
c = (uint64_t)(d0 >> 44); h0 = (uint64_t)d0 & 0xfffffffffff; | |||
d1 += c; c = (uint64_t)(d1 >> 44); h1 = (uint64_t)d1 & 0xfffffffffff; | |||
d2 += c; c = (uint64_t)(d2 >> 42); h2 = (uint64_t)d2 & 0x3ffffffffff; | |||
h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff; | |||
h1 += c; | |||
in += POLY1305_BLOCK_SIZE; | |||
inlen -= POLY1305_BLOCK_SIZE; | |||
} | |||
st->h[0] = h0; | |||
st->h[1] = h1; | |||
st->h[2] = h2; | |||
} | |||
static void | |||
poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) { | |||
poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; | |||
uint64_t h0, h1, h2, c; | |||
uint64_t g0, g1, g2; | |||
uint64_t t0, t1; | |||
/* process the remaining block */ | |||
if (remaining) { | |||
unsigned char final[POLY1305_BLOCK_SIZE] = {0}; | |||
size_t i; | |||
for (i = 0; i < remaining; i++) | |||
final[i] = in[i]; | |||
final[remaining] = 1; | |||
st->final = 1; | |||
poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE); | |||
} | |||
/* fully carry h */ | |||
h0 = st->h[0]; | |||
h1 = st->h[1]; | |||
h2 = st->h[2]; | |||
c = (h1 >> 44); h1 &= 0xfffffffffff; | |||
h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; | |||
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; | |||
h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff; | |||
h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; | |||
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; | |||
h1 += c; | |||
/* compute h + -p */ | |||
g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff; | |||
g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff; | |||
g2 = h2 + c - ((uint64_t)1 << 42); | |||
/* select h if h < p, or h + -p if h >= p */ | |||
c = (g2 >> 63) - 1; | |||
h0 = (h0 & ~c) | (g0 & c); | |||
h1 = (h1 & ~c) | (g1 & c); | |||
h2 = (h2 & ~c) | (g2 & c); | |||
/* h = (h + pad) */ | |||
t0 = st->pad[0]; | |||
t1 = st->pad[1]; | |||
h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff; | |||
h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff; | |||
h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff; | |||
/* mac = h % (2^128) */ | |||
h0 = ((h0 ) | (h1 << 44)); | |||
h1 = ((h1 >> 20) | (h2 << 24)); | |||
U64TO8(&mac[0], h0); | |||
U64TO8(&mac[8], h1); | |||
/* zero out the state */ | |||
st->h[0] = 0; | |||
st->h[1] = 0; | |||
st->h[2] = 0; | |||
st->r[0] = 0; | |||
st->r[1] = 0; | |||
st->r[2] = 0; | |||
st->pad[0] = 0; | |||
st->pad[1] = 0; | |||
} | |||
static void | |||
poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) { | |||
poly1305_state_ref_t st; | |||
size_t blocks; | |||
poly1305_init_ext_ref(&st, key, inlen); | |||
blocks = (inlen & ~(POLY1305_BLOCK_SIZE - 1)); | |||
if (blocks) { | |||
poly1305_blocks_ref(&st, in, blocks); | |||
in += blocks; | |||
inlen -= blocks; | |||
} | |||
poly1305_finish_ext_ref(&st, in, inlen, mac); | |||
} | |||
@@ -0,0 +1,966 @@ | |||
#include "../chacha20/macro.S" | |||
#include "constants.S" | |||
SECTION_TEXT | |||
/* size_t poly1305_block_size_sse2(void): returns 32, the SSE2 backend's block size in bytes. */
GLOBAL_HIDDEN_FN poly1305_block_size_sse2 | |||
movl $32, %eax | |||
ret | |||
FN_END poly1305_block_size_sse2 | |||
/*
 * void poly1305_init_ext_sse2(void *state, const poly1305_key *key, size_t bytes_hint)
 *
 * Assuming the System V AMD64 calling convention: %rdi = state, %rsi = key,
 * %rdx = bytes_hint.
 *
 * What the code below writes into the state:
 *   0..47    zeroed
 *   40..59   the clamped key r as five 32 bit words (26 bit limbs)
 *   60..79   first derived value (only when the hint is > 16)
 *   80..99   second derived value (only when the hint is >= 96)
 *   104..119 key bytes 16..31, copied verbatim
 *   120      a 64 bit zero (poly1305_blocks_sse2 reads this word as flags)
 *
 * A bytes_hint of 0 is replaced by all-ones, so both derived values are
 * computed. Scratch values are kept at negative offsets from %rsp without
 * adjusting the stack pointer.
 */
GLOBAL_HIDDEN_FN poly1305_init_ext_sse2 | |||
poly1305_init_ext_sse2_local: | |||
pushq %r15 | |||
xorps %xmm0, %xmm0 | |||
testq %rdx, %rdx | |||
pushq %r14 | |||
movq %rdx, %r11 | |||
movq $-1, %rax | |||
cmove %rax, %r11 | |||
pushq %r13 | |||
/* 17575274610687 = 0xffc0fffffff, clamp mask for the low 44 bit limb of r */
movabsq $17575274610687, %r9 | |||
pushq %r12 | |||
pushq %rbp | |||
movq %r11, %r13 | |||
/* 17592186044415 = 2^44 - 1, kept in %rbp as the 44 bit limb mask */
movabsq $17592186044415, %rbp | |||
pushq %rbx | |||
/* %rbx counts the passes of the loop at poly1305_init_ext_sse2_7 */
xorl %ebx, %ebx | |||
movdqu %xmm0, 32(%rdi) | |||
movdqu %xmm0, (%rdi) | |||
movdqu %xmm0, 16(%rdi) | |||
movq 8(%rsi), %rcx | |||
movq (%rsi), %rax | |||
movq %rcx, %rdx | |||
shrq $24, %rcx | |||
andq %rax, %r9 | |||
salq $20, %rdx | |||
shrq $44, %rax | |||
movq %r9, %r8 | |||
orq %rax, %rdx | |||
shrq $26, %r8 | |||
/* 17592181915647 = 0xfffffc0ffff, clamp mask for the middle limb of r */
movabsq $17592181915647, %rax | |||
andq %rax, %rdx | |||
/* 68719475727 = 0x00ffffffc0f, clamp mask for the top limb of r */
movabsq $68719475727, %rax | |||
andq %rax, %rcx | |||
/* repack the three limbs (%r9, %rdx, %rcx) into five 26 bit words; 67108863 = 2^26 - 1 */
movl %r9d, %eax | |||
andl $67108863, %eax | |||
movl %eax, 40(%rdi) | |||
movl %edx, %eax | |||
sall $18, %eax | |||
orl %r8d, %eax | |||
movq %rdx, %r8 | |||
andl $67108863, %eax | |||
shrq $34, %r8 | |||
movl %eax, 44(%rdi) | |||
movq %rdx, %rax | |||
shrq $8, %rax | |||
andl $67108863, %eax | |||
movl %eax, 48(%rdi) | |||
movl %ecx, %eax | |||
sall $10, %eax | |||
orl %r8d, %eax | |||
/* from here on %r8 holds the state pointer; %rdi is reused as the output cursor */
movq %rdi, %r8 | |||
andl $67108863, %eax | |||
movl %eax, 52(%rdi) | |||
movq %rcx, %rax | |||
shrq $16, %rax | |||
movl %eax, 56(%rdi) | |||
movq 16(%rsi), %rax | |||
movq %rax, 104(%rdi) | |||
movq 24(%rsi), %rax | |||
movq %rdx, %rsi | |||
movq %rax, 112(%rdi) | |||
/*
 * Loop head, run at most twice. Pass 0 writes to state+60 unless the hint
 * is <= 16; pass 1 writes to state+80 unless the hint is < 96.
 */
poly1305_init_ext_sse2_7: | |||
testq %rbx, %rbx | |||
jne poly1305_init_ext_sse2_4 | |||
cmpq $16, %r13 | |||
jbe poly1305_init_ext_sse2_5 | |||
leaq 60(%r8), %rdi | |||
jmp poly1305_init_ext_sse2_6 | |||
poly1305_init_ext_sse2_4: | |||
cmpq $96, %r13 | |||
jb poly1305_init_ext_sse2_5 | |||
leaq 80(%r8), %rdi | |||
/*
 * NOTE(review): the multiplications below multiply the limbs in %r9, %rsi
 * and %rcx by themselves, folding with the factor 20, so this appears to
 * square the current value modulo 2^130 - 5 (giving r^2 and then r^4) --
 * confirm against the reference implementation. The result is left in
 * %r9/%rsi/%rcx for the next pass and stored as five 26 bit words at (%rdi).
 */
poly1305_init_ext_sse2_6: | |||
imulq $20, %rcx, %r10 | |||
movq $0, -48(%rsp) | |||
movq $0, -32(%rsp) | |||
leaq (%rsi,%rsi), %r14 | |||
leaq (%r9,%r9), %r11 | |||
movq %r10, %rax | |||
mulq %r14 | |||
movq %rax, %r14 | |||
movq %r9, %rax | |||
movq %rdx, %r15 | |||
mulq %r9 | |||
addq %rax, %r14 | |||
movq %r14, %rax | |||
adcq %rdx, %r15 | |||
leaq (%rcx,%rcx), %rdx | |||
andq %rbp, %rax | |||
movq %rax, -16(%rsp) | |||
movq %r11, %rax | |||
movq %rdx, -24(%rsp) | |||
mulq %rsi | |||
movq %rax, %r11 | |||
movq %r10, %rax | |||
movq %rdx, %r12 | |||
mulq %rcx | |||
movq -16(%rsp), %rcx | |||
addq %rax, %r11 | |||
movq %r14, %rax | |||
adcq %rdx, %r12 | |||
shrdq $44, %r15, %rax | |||
movq %rax, -56(%rsp) | |||
movq -24(%rsp), %rax | |||
addq -56(%rsp), %r11 | |||
adcq -48(%rsp), %r12 | |||
mulq %r9 | |||
movq %r11, %r14 | |||
andq %rbp, %r14 | |||
movq %rax, %r9 | |||
movq %rsi, %rax | |||
movq %rdx, %r10 | |||
mulq %rsi | |||
addq %rax, %r9 | |||
movq %r11, %rax | |||
adcq %rdx, %r10 | |||
shrdq $44, %r12, %rax | |||
movq %rax, -40(%rsp) | |||
/* 4398046511103 = 2^42 - 1, mask for the top limb */
movabsq $4398046511103, %rax | |||
addq -40(%rsp), %r9 | |||
adcq -32(%rsp), %r10 | |||
andq %r9, %rax | |||
incq %rbx | |||
shrdq $42, %r10, %r9 | |||
/* the carry out of the top limb is multiplied by 5 and added to the low limb */
leaq (%r9,%r9,4), %r9 | |||
addq %r9, %rcx | |||
movq %rcx, %r9 | |||
shrq $44, %rcx | |||
addq %r14, %rcx | |||
andq %rbp, %r9 | |||
movq %rcx, %rsi | |||
shrq $44, %rcx | |||
movq %r9, %rdx | |||
addq %rax, %rcx | |||
movl %r9d, %eax | |||
andq %rbp, %rsi | |||
andl $67108863, %eax | |||
shrq $26, %rdx | |||
movl %eax, (%rdi) | |||
movl %esi, %eax | |||
sall $18, %eax | |||
orl %edx, %eax | |||
movq %rsi, %rdx | |||
andl $67108863, %eax | |||
shrq $34, %rdx | |||
movl %eax, 4(%rdi) | |||
movq %rsi, %rax | |||
shrq $8, %rax | |||
andl $67108863, %eax | |||
movl %eax, 8(%rdi) | |||
movl %ecx, %eax | |||
sall $10, %eax | |||
orl %edx, %eax | |||
andl $67108863, %eax | |||
movl %eax, 12(%rdi) | |||
movq %rcx, %rax | |||
shrq $16, %rax | |||
/* repeat until two passes have completed */
cmpq $2, %rbx | |||
movl %eax, 16(%rdi) | |||
jne poly1305_init_ext_sse2_7 | |||
/* common exit: clear the word at state+120 and restore the saved registers */
poly1305_init_ext_sse2_5: | |||
movq $0, 120(%r8) | |||
popq %rbx | |||
popq %rbp | |||
popq %r12 | |||
popq %r13 | |||
popq %r14 | |||
popq %r15 | |||
ret | |||
FN_END poly1305_init_ext_sse2 | |||
GLOBAL_HIDDEN_FN poly1305_blocks_sse2 | |||
poly1305_blocks_sse2_local: | |||
pushq %rbp | |||
movq %rsp, %rbp | |||
pushq %rbx | |||
andq $-64, %rsp | |||
subq $328, %rsp | |||
movq $(1 << 24), %rax | |||
movd %rax, %xmm1 | |||
movq $((1 << 26) - 1), %rax | |||
movd %rax, %xmm0 | |||
pshufd $68, %xmm1, %xmm1 | |||
pshufd $68, %xmm0, %xmm0 | |||
movq 120(%rdi), %rax | |||
movaps %xmm1, 312(%rsp) | |||
testb $4, %al | |||
je poly1305_blocks_sse2_11 | |||
movaps 312(%rsp), %xmm1 | |||
psrldq $8, %xmm1 | |||
movaps %xmm1, 312(%rsp) | |||
poly1305_blocks_sse2_11: | |||
testb $8, %al | |||
je poly1305_blocks_sse2_12 | |||
xorps %xmm1, %xmm1 | |||
movaps %xmm1, 312(%rsp) | |||
poly1305_blocks_sse2_12: | |||
testb $1, %al | |||
jne poly1305_blocks_sse2_13 | |||
movq 16(%rsi), %xmm1 | |||
movaps %xmm0, %xmm3 | |||
movaps %xmm0, %xmm9 | |||
movq (%rsi), %xmm15 | |||
orq $1, %rax | |||
subq $32, %rdx | |||
movq 8(%rsi), %xmm12 | |||
punpcklqdq %xmm1, %xmm15 | |||
movq 24(%rsi), %xmm1 | |||
movaps %xmm15, %xmm8 | |||
pand %xmm15, %xmm3 | |||
psrlq $52, %xmm15 | |||
addq $32, %rsi | |||
punpcklqdq %xmm1, %xmm12 | |||
movaps %xmm12, %xmm1 | |||
psrlq $26, %xmm8 | |||
psllq $12, %xmm1 | |||
pand %xmm0, %xmm8 | |||
movq %rax, 120(%rdi) | |||
por %xmm1, %xmm15 | |||
psrlq $40, %xmm12 | |||
pand %xmm15, %xmm9 | |||
por 312(%rsp), %xmm12 | |||
psrlq $26, %xmm15 | |||
pand %xmm0, %xmm15 | |||
jmp poly1305_blocks_sse2_14 | |||
poly1305_blocks_sse2_13: | |||
movdqu (%rdi), %xmm8 | |||
movdqu 16(%rdi), %xmm15 | |||
movdqu 32(%rdi), %xmm12 | |||
pshufd $80, %xmm8, %xmm3 | |||
pshufd $250, %xmm8, %xmm8 | |||
pshufd $80, %xmm15, %xmm9 | |||
pshufd $250, %xmm15, %xmm15 | |||
pshufd $80, %xmm12, %xmm12 | |||
poly1305_blocks_sse2_14: | |||
movq 120(%rdi), %rax | |||
testb $48, %al | |||
je poly1305_blocks_sse2_15 | |||
testb $16, %al | |||
movd 56(%rdi), %xmm2 | |||
leaq 40(%rdi), %rax | |||
je poly1305_blocks_sse2_16 | |||
movdqu 60(%rdi), %xmm1 | |||
movdqu (%rax), %xmm4 | |||
movd %xmm2, %eax | |||
movd 76(%rdi), %xmm2 | |||
movaps %xmm1, %xmm7 | |||
movd %eax, %xmm5 | |||
punpckldq %xmm4, %xmm7 | |||
punpckhdq %xmm4, %xmm1 | |||
punpcklqdq %xmm5, %xmm2 | |||
jmp poly1305_blocks_sse2_17 | |||
poly1305_blocks_sse2_16: | |||
movdqu (%rax), %xmm1 | |||
movl $1, %r8d | |||
movd %r8d, %xmm4 | |||
movaps %xmm1, %xmm7 | |||
punpckldq %xmm4, %xmm7 | |||
punpckhdq %xmm4, %xmm1 | |||
poly1305_blocks_sse2_17: | |||
pshufd $80, %xmm7, %xmm11 | |||
pshufd $80, %xmm1, %xmm4 | |||
pshufd $250, %xmm7, %xmm7 | |||
movaps %xmm11, 168(%rsp) | |||
pshufd $250, %xmm1, %xmm1 | |||
jmp poly1305_blocks_sse2_18 | |||
poly1305_blocks_sse2_15: | |||
movdqu 60(%rdi), %xmm1 | |||
movd 76(%rdi), %xmm2 | |||
pshufd $0, %xmm2, %xmm2 | |||
pshufd $0, %xmm1, %xmm11 | |||
pshufd $85, %xmm1, %xmm7 | |||
pshufd $170, %xmm1, %xmm4 | |||
movaps %xmm11, 168(%rsp) | |||
pshufd $255, %xmm1, %xmm1 | |||
poly1305_blocks_sse2_18: | |||
movaps %xmm1, %xmm14 | |||
movaps %xmm7, %xmm5 | |||
movaps %xmm4, %xmm13 | |||
movaps %xmm1, 264(%rsp) | |||
movaps %xmm2, %xmm1 | |||
cmpq $63, %rdx | |||
movq $(5), %r8 | |||
movd %r8, %xmm6 | |||
pshufd $68, %xmm6, %xmm6 | |||
pmuludq %xmm6, %xmm5 | |||
movaps %xmm4, 296(%rsp) | |||
pmuludq %xmm6, %xmm13 | |||
movaps %xmm2, 152(%rsp) | |||
pmuludq %xmm6, %xmm14 | |||
pmuludq %xmm6, %xmm1 | |||
movaps %xmm5, 88(%rsp) | |||
movaps %xmm13, 72(%rsp) | |||
movaps %xmm14, 56(%rsp) | |||
movaps %xmm1, 40(%rsp) | |||
jbe poly1305_blocks_sse2_19 | |||
movdqu 80(%rdi), %xmm1 | |||
movd 96(%rdi), %xmm2 | |||
movq %rdx, %rcx | |||
pshufd $0, %xmm2, %xmm2 | |||
movaps %xmm2, 24(%rsp) | |||
pmuludq %xmm6, %xmm2 | |||
pshufd $85, %xmm1, %xmm4 | |||
movaps %xmm4, 280(%rsp) | |||
pmuludq %xmm6, %xmm4 | |||
pshufd $255, %xmm1, %xmm13 | |||
pshufd $170, %xmm1, %xmm5 | |||
movaps 72(%rsp), %xmm14 | |||
movaps %xmm5, 216(%rsp) | |||
pmuludq %xmm6, %xmm5 | |||
movq %rsi, %rax | |||
movaps %xmm4, -24(%rsp) | |||
movaps %xmm13, %xmm4 | |||
pshufd $0, %xmm1, %xmm1 | |||
pmuludq %xmm6, %xmm4 | |||
movaps %xmm14, -8(%rsp) | |||
movaps %xmm5, 8(%rsp) | |||
movaps 168(%rsp), %xmm5 | |||
movaps %xmm1, 248(%rsp) | |||
movaps 56(%rsp), %xmm1 | |||
movaps %xmm4, 120(%rsp) | |||
movaps 40(%rsp), %xmm4 | |||
movaps %xmm13, 136(%rsp) | |||
movaps %xmm2, 200(%rsp) | |||
movaps %xmm1, 104(%rsp) | |||
movaps %xmm4, 184(%rsp) | |||
movaps %xmm5, 232(%rsp) | |||
jmp poly1305_blocks_sse2_20 | |||
.p2align 6 | |||
poly1305_blocks_sse2_20: | |||
movaps -24(%rsp), %xmm5 | |||
movaps %xmm8, %xmm13 | |||
subq $64, %rcx | |||
movaps 8(%rsp), %xmm4 | |||
movaps 120(%rsp), %xmm10 | |||
pmuludq %xmm12, %xmm5 | |||
pmuludq %xmm15, %xmm4 | |||
movaps 8(%rsp), %xmm2 | |||
pmuludq %xmm9, %xmm10 | |||
movaps 120(%rsp), %xmm11 | |||
movaps 200(%rsp), %xmm14 | |||
pmuludq %xmm12, %xmm2 | |||
paddq %xmm4, %xmm5 | |||
pmuludq %xmm15, %xmm11 | |||
movaps 120(%rsp), %xmm1 | |||
paddq %xmm10, %xmm5 | |||
pmuludq %xmm8, %xmm14 | |||
movaps 200(%rsp), %xmm10 | |||
movaps 200(%rsp), %xmm4 | |||
pmuludq %xmm12, %xmm1 | |||
movaps 248(%rsp), %xmm8 | |||
pmuludq %xmm15, %xmm10 | |||
paddq %xmm11, %xmm2 | |||
pmuludq %xmm12, %xmm4 | |||
paddq %xmm14, %xmm5 | |||
movaps 200(%rsp), %xmm11 | |||
movaps 248(%rsp), %xmm14 | |||
pmuludq %xmm15, %xmm8 | |||
pmuludq 248(%rsp), %xmm12 | |||
pmuludq %xmm9, %xmm11 | |||
paddq %xmm10, %xmm1 | |||
movaps 248(%rsp), %xmm10 | |||
pmuludq 280(%rsp), %xmm15 | |||
pmuludq %xmm3, %xmm14 | |||
paddq %xmm15, %xmm12 | |||
paddq %xmm8, %xmm4 | |||
pmuludq %xmm13, %xmm10 | |||
movq 24(%rax), %xmm15 | |||
movaps 248(%rsp), %xmm8 | |||
paddq %xmm11, %xmm2 | |||
movaps %xmm3, %xmm11 | |||
movaps 280(%rsp), %xmm3 | |||
paddq %xmm14, %xmm5 | |||
pmuludq %xmm9, %xmm8 | |||
paddq %xmm10, %xmm2 | |||
movq 16(%rax), %xmm14 | |||
movaps 280(%rsp), %xmm10 | |||
pmuludq %xmm9, %xmm3 | |||
pmuludq 216(%rsp), %xmm9 | |||
paddq %xmm9, %xmm12 | |||
paddq %xmm8, %xmm1 | |||
movq (%rax), %xmm8 | |||
pmuludq %xmm11, %xmm10 | |||
paddq %xmm3, %xmm4 | |||
movaps 216(%rsp), %xmm3 | |||
punpcklqdq %xmm14, %xmm8 | |||
movaps 280(%rsp), %xmm14 | |||
pmuludq %xmm13, %xmm3 | |||
paddq %xmm10, %xmm2 | |||
movq 8(%rax), %xmm10 | |||
pmuludq %xmm13, %xmm14 | |||
pmuludq 136(%rsp), %xmm13 | |||
paddq %xmm13, %xmm12 | |||
punpcklqdq %xmm15, %xmm10 | |||
movaps %xmm10, %xmm9 | |||
movaps 216(%rsp), %xmm15 | |||
paddq %xmm3, %xmm4 | |||
psllq $12, %xmm9 | |||
movaps %xmm0, %xmm3 | |||
paddq %xmm14, %xmm1 | |||
pmuludq %xmm11, %xmm15 | |||
pand %xmm8, %xmm3 | |||
movaps 136(%rsp), %xmm14 | |||
movaps %xmm3, -40(%rsp) | |||
movaps %xmm8, %xmm3 | |||
movdqu 48(%rax), %xmm13 | |||
psrlq $52, %xmm8 | |||
pmuludq %xmm11, %xmm14 | |||
paddq %xmm15, %xmm1 | |||
por %xmm9, %xmm8 | |||
pmuludq 24(%rsp), %xmm11 | |||
paddq %xmm11, %xmm12 | |||
movdqu 32(%rax), %xmm11 | |||
movaps %xmm10, %xmm9 | |||
psrlq $40, %xmm10 | |||
pand %xmm0, %xmm8 | |||
movaps %xmm11, %xmm15 | |||
paddq %xmm14, %xmm4 | |||
xorps %xmm14, %xmm14 | |||
punpckldq %xmm13, %xmm15 | |||
psrlq $14, %xmm9 | |||
addq $64, %rax | |||
pand %xmm0, %xmm9 | |||
psrlq $26, %xmm3 | |||
cmpq $63, %rcx | |||
por 312(%rsp), %xmm10 | |||
movaps %xmm13, -72(%rsp) | |||
movaps %xmm15, %xmm13 | |||
punpckldq %xmm14, %xmm13 | |||
punpckhdq -72(%rsp), %xmm11 | |||
movaps %xmm13, -56(%rsp) | |||
movaps %xmm11, %xmm13 | |||
punpckhdq %xmm14, %xmm11 | |||
pand %xmm0, %xmm3 | |||
psllq $18, %xmm11 | |||
punpckhdq %xmm14, %xmm15 | |||
punpckldq %xmm14, %xmm13 | |||
paddq %xmm11, %xmm4 | |||
movaps -8(%rsp), %xmm11 | |||
psllq $6, %xmm15 | |||
psllq $12, %xmm13 | |||
movaps 88(%rsp), %xmm14 | |||
paddq %xmm15, %xmm2 | |||
pmuludq %xmm10, %xmm11 | |||
paddq %xmm13, %xmm1 | |||
movaps -8(%rsp), %xmm13 | |||
pmuludq %xmm10, %xmm14 | |||
paddq -56(%rsp), %xmm5 | |||
paddq 312(%rsp), %xmm12 | |||
pmuludq %xmm9, %xmm13 | |||
movaps 104(%rsp), %xmm15 | |||
paddq %xmm11, %xmm2 | |||
movaps 184(%rsp), %xmm11 | |||
paddq %xmm14, %xmm5 | |||
movaps 104(%rsp), %xmm14 | |||
pmuludq %xmm9, %xmm15 | |||
pmuludq %xmm10, %xmm11 | |||
paddq %xmm13, %xmm5 | |||
movaps 104(%rsp), %xmm13 | |||
pmuludq %xmm10, %xmm14 | |||
pmuludq 232(%rsp), %xmm10 | |||
paddq %xmm10, %xmm12 | |||
pmuludq %xmm8, %xmm13 | |||
paddq %xmm15, %xmm2 | |||
movaps %xmm8, %xmm10 | |||
paddq %xmm11, %xmm4 | |||
pmuludq %xmm7, %xmm10 | |||
movaps 232(%rsp), %xmm11 | |||
movaps 184(%rsp), %xmm15 | |||
paddq %xmm14, %xmm1 | |||
pmuludq %xmm9, %xmm11 | |||
paddq %xmm13, %xmm5 | |||
movaps 184(%rsp), %xmm13 | |||
movaps 184(%rsp), %xmm14 | |||
pmuludq %xmm3, %xmm15 | |||
pmuludq %xmm9, %xmm13 | |||
paddq %xmm11, %xmm4 | |||
pmuludq %xmm8, %xmm14 | |||
movaps 232(%rsp), %xmm11 | |||
paddq %xmm10, %xmm4 | |||
paddq %xmm15, %xmm5 | |||
pmuludq %xmm7, %xmm9 | |||
pmuludq %xmm8, %xmm11 | |||
paddq %xmm13, %xmm1 | |||
movaps 232(%rsp), %xmm13 | |||
movaps 296(%rsp), %xmm10 | |||
paddq %xmm14, %xmm2 | |||
pmuludq 296(%rsp), %xmm8 | |||
movaps -40(%rsp), %xmm14 | |||
pmuludq %xmm3, %xmm13 | |||
paddq %xmm9, %xmm12 | |||
paddq %xmm11, %xmm1 | |||
movaps %xmm3, %xmm11 | |||
paddq %xmm8, %xmm12 | |||
movaps 232(%rsp), %xmm15 | |||
pmuludq %xmm7, %xmm11 | |||
pmuludq %xmm3, %xmm10 | |||
paddq %xmm13, %xmm2 | |||
movaps %xmm14, %xmm13 | |||
movaps 296(%rsp), %xmm9 | |||
pmuludq %xmm14, %xmm15 | |||
pmuludq 264(%rsp), %xmm3 | |||
paddq %xmm11, %xmm1 | |||
pmuludq %xmm7, %xmm13 | |||
paddq %xmm3, %xmm12 | |||
movaps 264(%rsp), %xmm11 | |||
paddq %xmm10, %xmm4 | |||
pmuludq %xmm14, %xmm9 | |||
paddq %xmm15, %xmm5 | |||
pmuludq %xmm14, %xmm11 | |||
movaps %xmm5, %xmm8 | |||
paddq %xmm13, %xmm2 | |||
psrlq $26, %xmm8 | |||
paddq %xmm9, %xmm1 | |||
pand %xmm0, %xmm5 | |||
pmuludq 152(%rsp), %xmm14 | |||
paddq %xmm14, %xmm12 | |||
paddq %xmm8, %xmm2 | |||
paddq %xmm11, %xmm4 | |||
movaps %xmm2, %xmm9 | |||
movaps %xmm2, %xmm8 | |||
movaps %xmm4, %xmm3 | |||
psrlq $26, %xmm9 | |||
pand %xmm0, %xmm4 | |||
psrlq $26, %xmm3 | |||
paddq %xmm9, %xmm1 | |||
pand %xmm0, %xmm8 | |||
paddq %xmm3, %xmm12 | |||
movaps %xmm1, %xmm10 | |||
movaps %xmm1, %xmm9 | |||
movaps %xmm12, %xmm3 | |||
psrlq $26, %xmm10 | |||
pand %xmm0, %xmm12 | |||
psrlq $26, %xmm3 | |||
paddq %xmm10, %xmm4 | |||
pand %xmm0, %xmm9 | |||
pmuludq %xmm6, %xmm3 | |||
movaps %xmm4, %xmm1 | |||
movaps %xmm4, %xmm15 | |||
psrlq $26, %xmm1 | |||
pand %xmm0, %xmm15 | |||
paddq %xmm1, %xmm12 | |||
paddq %xmm3, %xmm5 | |||
movaps %xmm5, %xmm2 | |||
movaps %xmm5, %xmm3 | |||
psrlq $26, %xmm2 | |||
pand %xmm0, %xmm3 | |||
paddq %xmm2, %xmm8 | |||
ja poly1305_blocks_sse2_20 | |||
leaq -64(%rdx), %rax | |||
andl $63, %edx | |||
andq $-64, %rax | |||
leaq 64(%rsi,%rax), %rsi | |||
poly1305_blocks_sse2_19: | |||
cmpq $31, %rdx | |||
jbe poly1305_blocks_sse2_21 | |||
movaps 56(%rsp), %xmm11 | |||
movaps %xmm15, %xmm1 | |||
movaps %xmm15, %xmm14 | |||
movaps 72(%rsp), %xmm5 | |||
movaps %xmm12, %xmm4 | |||
movaps %xmm15, %xmm10 | |||
movaps 88(%rsp), %xmm2 | |||
pmuludq %xmm11, %xmm14 | |||
movaps %xmm8, %xmm15 | |||
pmuludq %xmm5, %xmm1 | |||
movaps 40(%rsp), %xmm13 | |||
testq %rsi, %rsi | |||
pmuludq %xmm12, %xmm2 | |||
pmuludq %xmm12, %xmm5 | |||
pmuludq %xmm11, %xmm4 | |||
paddq %xmm1, %xmm2 | |||
pmuludq %xmm9, %xmm11 | |||
movaps %xmm12, %xmm1 | |||
paddq %xmm14, %xmm5 | |||
pmuludq %xmm13, %xmm15 | |||
movaps %xmm9, %xmm14 | |||
pmuludq %xmm13, %xmm14 | |||
pmuludq %xmm13, %xmm1 | |||
paddq %xmm11, %xmm2 | |||
movaps 168(%rsp), %xmm11 | |||
pmuludq %xmm10, %xmm13 | |||
paddq %xmm15, %xmm2 | |||
movaps %xmm9, %xmm15 | |||
paddq %xmm14, %xmm5 | |||
pmuludq %xmm11, %xmm12 | |||
movaps %xmm3, %xmm14 | |||
pmuludq %xmm11, %xmm14 | |||
movaps %xmm13, 248(%rsp) | |||
movaps %xmm10, %xmm13 | |||
pmuludq %xmm7, %xmm15 | |||
paddq 248(%rsp), %xmm4 | |||
pmuludq %xmm11, %xmm13 | |||
pmuludq %xmm7, %xmm10 | |||
paddq %xmm14, %xmm2 | |||
movaps %xmm13, 280(%rsp) | |||
movaps %xmm8, %xmm13 | |||
pmuludq %xmm11, %xmm13 | |||
paddq %xmm10, %xmm12 | |||
movaps 296(%rsp), %xmm10 | |||
paddq 280(%rsp), %xmm1 | |||
pmuludq %xmm9, %xmm11 | |||
pmuludq 296(%rsp), %xmm9 | |||
pmuludq %xmm3, %xmm10 | |||
paddq %xmm9, %xmm12 | |||
paddq %xmm13, %xmm5 | |||
movaps %xmm3, %xmm13 | |||
paddq %xmm15, %xmm1 | |||
pmuludq %xmm7, %xmm13 | |||
paddq %xmm11, %xmm4 | |||
movaps 296(%rsp), %xmm11 | |||
pmuludq %xmm8, %xmm7 | |||
pmuludq %xmm8, %xmm11 | |||
pmuludq 264(%rsp), %xmm8 | |||
paddq %xmm8, %xmm12 | |||
paddq %xmm13, %xmm5 | |||
paddq %xmm7, %xmm4 | |||
movaps 264(%rsp), %xmm7 | |||
paddq %xmm11, %xmm1 | |||
paddq %xmm10, %xmm4 | |||
pmuludq %xmm3, %xmm7 | |||
pmuludq 152(%rsp), %xmm3 | |||
paddq %xmm3, %xmm12 | |||
paddq %xmm7, %xmm1 | |||
je poly1305_blocks_sse2_22 | |||
movdqu (%rsi), %xmm7 | |||
xorps %xmm3, %xmm3 | |||
paddq 312(%rsp), %xmm12 | |||
movdqu 16(%rsi), %xmm8 | |||
movaps %xmm7, %xmm9 | |||
punpckldq %xmm8, %xmm9 | |||
punpckhdq %xmm8, %xmm7 | |||
movaps %xmm9, %xmm10 | |||
movaps %xmm7, %xmm8 | |||
punpckldq %xmm3, %xmm10 | |||
punpckhdq %xmm3, %xmm9 | |||
punpckhdq %xmm3, %xmm7 | |||
punpckldq %xmm3, %xmm8 | |||
movaps %xmm8, %xmm3 | |||
psllq $6, %xmm9 | |||
paddq %xmm10, %xmm2 | |||
psllq $12, %xmm3 | |||
paddq %xmm9, %xmm5 | |||
psllq $18, %xmm7 | |||
paddq %xmm3, %xmm4 | |||
paddq %xmm7, %xmm1 | |||
poly1305_blocks_sse2_22: | |||
movaps %xmm2, %xmm8 | |||
movaps %xmm1, %xmm3 | |||
movaps %xmm1, %xmm15 | |||
psrlq $26, %xmm8 | |||
pand %xmm0, %xmm2 | |||
pand %xmm0, %xmm15 | |||
psrlq $26, %xmm3 | |||
paddq %xmm5, %xmm8 | |||
paddq %xmm12, %xmm3 | |||
movaps %xmm8, %xmm9 | |||
pand %xmm0, %xmm8 | |||
movaps %xmm3, %xmm1 | |||
psrlq $26, %xmm9 | |||
movaps %xmm3, %xmm12 | |||
psrlq $26, %xmm1 | |||
paddq %xmm4, %xmm9 | |||
pand %xmm0, %xmm12 | |||
pmuludq %xmm1, %xmm6 | |||
movaps %xmm9, %xmm3 | |||
pand %xmm0, %xmm9 | |||
psrlq $26, %xmm3 | |||
paddq %xmm3, %xmm15 | |||
paddq %xmm6, %xmm2 | |||
movaps %xmm15, %xmm3 | |||
pand %xmm0, %xmm15 | |||
movaps %xmm2, %xmm1 | |||
psrlq $26, %xmm3 | |||
psrlq $26, %xmm1 | |||
paddq %xmm3, %xmm12 | |||
movaps %xmm0, %xmm3 | |||
paddq %xmm1, %xmm8 | |||
pand %xmm2, %xmm3 | |||
poly1305_blocks_sse2_21: | |||
testq %rsi, %rsi | |||
je poly1305_blocks_sse2_23 | |||
pshufd $8, %xmm3, %xmm3 | |||
pshufd $8, %xmm8, %xmm8 | |||
pshufd $8, %xmm9, %xmm9 | |||
pshufd $8, %xmm15, %xmm15 | |||
pshufd $8, %xmm12, %xmm12 | |||
punpcklqdq %xmm8, %xmm3 | |||
punpcklqdq %xmm15, %xmm9 | |||
movdqu %xmm3, (%rdi) | |||
movdqu %xmm9, 16(%rdi) | |||
movq %xmm12, 32(%rdi) | |||
jmp poly1305_blocks_sse2_10 | |||
poly1305_blocks_sse2_23: | |||
movaps %xmm3, %xmm0 | |||
movaps %xmm8, %xmm4 | |||
movaps %xmm9, %xmm2 | |||
psrldq $8, %xmm0 | |||
movaps %xmm15, %xmm10 | |||
paddq %xmm0, %xmm3 | |||
psrldq $8, %xmm4 | |||
movaps %xmm12, %xmm0 | |||
movd %xmm3, %edx | |||
paddq %xmm4, %xmm8 | |||
psrldq $8, %xmm2 | |||
movl %edx, %ecx | |||
movd %xmm8, %eax | |||
paddq %xmm2, %xmm9 | |||
shrl $26, %ecx | |||
psrldq $8, %xmm10 | |||
andl $67108863, %edx | |||
addl %ecx, %eax | |||
movd %xmm9, %ecx | |||
paddq %xmm10, %xmm15 | |||
movl %eax, %r9d | |||
shrl $26, %eax | |||
psrldq $8, %xmm0 | |||
addl %ecx, %eax | |||
movd %xmm15, %ecx | |||
paddq %xmm0, %xmm12 | |||
movl %eax, %esi | |||
andl $67108863, %r9d | |||
movd %xmm12, %r10d | |||
shrl $26, %esi | |||
andl $67108863, %eax | |||
addl %ecx, %esi | |||
salq $8, %rax | |||
movl %r9d, %ecx | |||
shrl $18, %r9d | |||
movl %esi, %r8d | |||
shrl $26, %esi | |||
andl $67108863, %r8d | |||
addl %r10d, %esi | |||
orq %r9, %rax | |||
salq $16, %rsi | |||
movq %r8, %r9 | |||
shrl $10, %r8d | |||
salq $26, %rcx | |||
orq %r8, %rsi | |||
salq $34, %r9 | |||
orq %rdx, %rcx | |||
movq %rsi, %r11 | |||
shrq $42, %rsi | |||
movabsq $17592186044415, %rdx | |||
orq %r9, %rax | |||
movabsq $4398046511103, %r8 | |||
andq %rdx, %rcx | |||
andq %rdx, %rax | |||
andq %r8, %r11 | |||
leaq (%rsi,%rsi,4), %rsi | |||
addq %rsi, %rcx | |||
movq %rcx, %r10 | |||
shrq $44, %rcx | |||
addq %rcx, %rax | |||
andq %rdx, %r10 | |||
movq %rax, %r9 | |||
shrq $44, %rax | |||
addq %r11, %rax | |||
andq %rdx, %r9 | |||
movabsq $-4398046511104, %r11 | |||
movq %rax, %rcx | |||
andq %r8, %rcx | |||
shrq $42, %rax | |||
leaq (%rax,%rax,4), %rsi | |||
addq %rcx, %r11 | |||
addq %r10, %rsi | |||
movq %rsi, %r8 | |||
shrq $44, %rsi | |||
andq %rdx, %r8 | |||
addq %r9, %rsi | |||
leaq 5(%r8), %r9 | |||
movq %r9, %rbx | |||
andq %rdx, %r9 | |||
shrq $44, %rbx | |||
addq %rsi, %rbx | |||
movq %rbx, %rax | |||
andq %rbx, %rdx | |||
shrq $44, %rax | |||
addq %rax, %r11 | |||
movq %r11, %rax | |||
shrq $63, %rax | |||
decq %rax | |||
movq %rax, %r10 | |||
andq %rax, %r9 | |||
andq %rax, %rdx | |||
notq %r10 | |||
andq %r11, %rax | |||
andq %r10, %r8 | |||
andq %r10, %rsi | |||
andq %r10, %rcx | |||
orq %r9, %r8 | |||
orq %rdx, %rsi | |||
orq %rax, %rcx | |||
movq %r8, (%rdi) | |||
movq %rsi, 8(%rdi) | |||
movq %rcx, 16(%rdi) | |||
poly1305_blocks_sse2_10: | |||
movq -8(%rbp), %rbx | |||
leave | |||
ret | |||
FN_END poly1305_blocks_sse2 | |||
/*
 * poly1305_finish_ext_sse2 (also reachable as poly1305_finish_ext_sse2_local)
 *
 * Absorbs the trailing input bytes, folds the state down to a 16-byte
 * result, stores that result and wipes the state block.
 *
 * Registers on entry, as used by the code below:
 *   rdi - base of the state block (kept in rbx)
 *   rsi - pointer to the trailing input bytes
 *   rdx - number of trailing bytes (kept in rbp)
 *   rcx - destination of the 16-byte result (kept in r12)
 *
 * The copy below only looks at bits 16, 8, 4, 2 and 1 of the count and
 * uses a 32-byte stack buffer.
 * NOTE(review): presumably callers always pass fewer than 32 trailing
 * bytes - confirm for callers outside this file.
 *
 * rbx, rbp and r12 are saved and restored; rax, rcx, rdx, rsi, rdi and
 * xmm0 are overwritten here, in addition to whatever
 * poly1305_blocks_sse2_local overwrites.
 */
GLOBAL_HIDDEN_FN poly1305_finish_ext_sse2 | |||
poly1305_finish_ext_sse2_local: | |||
/* Save the three callee-saved registers and park the arguments in them. */
pushq %r12 | |||
movq %rcx, %r12 | |||
pushq %rbp | |||
movq %rdx, %rbp | |||
pushq %rbx | |||
movq %rdi, %rbx | |||
/* 32-byte scratch buffer. Return address + 3 pushes + 32 bytes = 64, */
/* so rsp keeps the 16-byte alignment class the caller had at the call. */
subq $32, %rsp | |||
/* No trailing bytes: skip the buffer handling entirely. */
testq %rdx, %rdx | |||
je poly1305_finish_ext_sse2_27 | |||
/* Zero the buffer: 8 dword stores of eax = 0 starting at rsp. */
/* (rep stosl counts upward only if the direction flag is clear - */
/* assumed to be guaranteed by the calling convention.) */
xorl %eax, %eax | |||
movq %rsp, %rdi | |||
movl $8, %ecx | |||
rep stosl | |||
/* rsi becomes (source - buffer), so (%rax,%rsi) addresses the source */
/* byte that corresponds to the buffer cursor held in rax. */
subq %rsp, %rsi | |||
/* The flags from this testb are consumed by the je two lines below; */
/* the movq in between does not modify flags. */
testb $16, %dl | |||
movq %rsp, %rax | |||
je poly1305_finish_ext_sse2_28 | |||
/* Count has bit 16 set: copy 16 bytes. The movaps store needs rsp to */
/* be 16-byte aligned. */
movdqu (%rsp,%rsi), %xmm0 | |||
addq $16, %rax | |||
movaps %xmm0, (%rsp) | |||
poly1305_finish_ext_sse2_28: | |||
/* Count has bit 8 set: copy 8 bytes and advance the cursor. */
testb $8, %bpl | |||
je poly1305_finish_ext_sse2_29 | |||
movq (%rax,%rsi), %rdx | |||
movq %rdx, (%rax) | |||
addq $8, %rax | |||
poly1305_finish_ext_sse2_29: | |||
/* Count has bit 4 set: copy 4 bytes and advance the cursor. */
testb $4, %bpl | |||
je poly1305_finish_ext_sse2_30 | |||
movl (%rax,%rsi), %edx | |||
movl %edx, (%rax) | |||
addq $4, %rax | |||
poly1305_finish_ext_sse2_30: | |||
/* Count has bit 2 set: copy 2 bytes and advance the cursor. */
testb $2, %bpl | |||
je poly1305_finish_ext_sse2_31 | |||
movw (%rax,%rsi), %dx | |||
movw %dx, (%rax) | |||
addq $2, %rax | |||
poly1305_finish_ext_sse2_31: | |||
/* Count has bit 1 set: copy the last single byte. */
testb $1, %bpl | |||
je poly1305_finish_ext_sse2_32 | |||
movb (%rax,%rsi), %dl | |||
movb %dl, (%rax) | |||
poly1305_finish_ext_sse2_32: | |||
/* Unless exactly 16 bytes were supplied, write a single 1 byte right */
/* after the copied data (buffer offset = count). */
cmpq $16, %rbp | |||
je poly1305_finish_ext_sse2_33 | |||
movb $1, (%rsp,%rbp) | |||
poly1305_finish_ext_sse2_33: | |||
/* Select the flag to add to the qword at state+120. The carry from */
/* this cmpq is consumed by the sbbq below (movl/movq leave flags */
/* alone): sbbq gives all-ones when count < 16, else 0; masked with 4 */
/* and incremented by 4 that yields 8 for count < 16 and 4 otherwise. */
/* The visible part of poly1305_blocks_sse2 zeroes one of its stack */
/* constants when bit 8 is set; the meaning of bit 4 is decided by */
/* code not shown in this chunk. */
cmpq $16, %rbp | |||
movl $32, %edx | |||
movq %rsp, %rsi | |||
sbbq %rax, %rax | |||
movq %rbx, %rdi | |||
andl $4, %eax | |||
addq $4, %rax | |||
orq %rax, 120(%rbx) | |||
/* Process the 32-byte buffer: rdi = state, rsi = buffer, rdx = 32. */
call poly1305_blocks_sse2_local | |||
poly1305_finish_ext_sse2_27: | |||
/* If bit 1 of the flags at state+120 is clear, go straight to the */
/* final combination. (poly1305_blocks_sse2 sets that bit when it */
/* consumes its first 32 bytes.) */
movq 120(%rbx), %rax | |||
testb $1, %al | |||
je poly1305_finish_ext_sse2_35 | |||
/* count - 1 <= 15 (unsigned) holds exactly for count in 1..16; a count */
/* of 0 wraps to all-ones and fails the test. Counts 1..16 add flag 32, */
/* everything else adds flag 16. */
decq %rbp | |||
cmpq $15, %rbp | |||
jbe poly1305_finish_ext_sse2_36 | |||
orq $16, %rax | |||
jmp poly1305_finish_ext_sse2_40 | |||
poly1305_finish_ext_sse2_36: | |||
orq $32, %rax | |||
poly1305_finish_ext_sse2_40: | |||
/* Store the updated flags and call the block routine once more with a */
/* null input pointer (rsi = 0) and length 32; that routine branches on */
/* a zero rsi to write its scalar result into the state. */
movq %rax, 120(%rbx) | |||
movl $32, %edx | |||
xorl %esi, %esi | |||
movq %rbx, %rdi | |||
call poly1305_blocks_sse2_local | |||
poly1305_finish_ext_sse2_35: | |||
/* Combine the three qwords at state+0/+8/+16 (stored by the block */
/* routine under 44-, 44- and 42-bit masks) into a 128-bit value: */
/*   low  (rdx) = state[0] | (state[8] << 44) */
/*   high (rax) = (state[8] >> 20) | (state[16] << 24) */
/* then add the 128-bit value held at state+104/+112 with carry. */
/* NOTE(review): state+104/+112 presumably hold the key's pad half - */
/* confirm against the init routine. */
movq 8(%rbx), %rax | |||
movq 112(%rbx), %rsi | |||
movq %rax, %rdx | |||
movq %rax, %rcx | |||
movq 16(%rbx), %rax | |||
shrq $20, %rcx | |||
salq $44, %rdx | |||
orq (%rbx), %rdx | |||
salq $24, %rax | |||
orq %rcx, %rax | |||
movq 104(%rbx), %rcx | |||
addq %rcx, %rdx | |||
adcq %rsi, %rax | |||
/* Wipe all 128 bytes of the state block before returning; the result */
/* stays in rdx:rax, which these SSE stores do not touch. */
xorps %xmm0, %xmm0 | |||
movdqu %xmm0, (%rbx) | |||
movdqu %xmm0, 16(%rbx) | |||
movdqu %xmm0, 32(%rbx) | |||
movdqu %xmm0, 48(%rbx) | |||
movdqu %xmm0, 64(%rbx) | |||
movdqu %xmm0, 80(%rbx) | |||
movdqu %xmm0, 96(%rbx) | |||
movdqu %xmm0, 112(%rbx) | |||
/* Write the 16-byte result to the destination saved in r12. */
movq %rdx, (%r12) | |||
movq %rax, 8(%r12) | |||
/* Release the scratch buffer and restore the saved registers. */
addq $32, %rsp | |||
popq %rbx | |||
popq %rbp | |||
popq %r12 | |||
ret | |||
FN_END poly1305_finish_ext_sse2 | |||
/*
 * poly1305_auth_sse2
 *
 * One-shot driver: builds a temporary state on the stack, feeds the
 * input through the SSE2 block routine and writes the 16-byte result.
 *
 * Registers on entry, as used by the code below:
 *   rdi - destination of the 16-byte result (kept in r13)
 *   rsi - pointer to the input (kept in r12)
 *   rdx - input length in bytes (kept in rbx)
 *   rcx - passed through as the second argument of
 *         poly1305_init_ext_sse2_local (presumably the key - confirm
 *         against that routine)
 *
 * Inputs shorter than 128 bytes are handed to poly1305_auth_x86_local
 * with all argument registers untouched.
 */
GLOBAL_HIDDEN_FN poly1305_auth_sse2 | |||
/* Short input: tail-jump before any stack or register changes. */
cmpq $128, %rdx | |||
jb poly1305_auth_x86_local | |||
/* Frame setup: rbp anchors the frame; r14, r13, r12 and rbx are pushed */
/* right below it (32 bytes) and receive the arguments. */
pushq %rbp | |||
movq %rsp, %rbp | |||
pushq %r14 | |||
pushq %r13 | |||
movq %rdi, %r13 | |||
pushq %r12 | |||
movq %rsi, %r12 | |||
movq %rcx, %rsi | |||
pushq %rbx | |||
movq %rdx, %rbx | |||
/* Round rsp down to a 64-byte boundary and reserve 128 bytes there for */
/* the state block. */
andq $-64, %rsp | |||
movq %rbx, %r14 | |||
addq $-128, %rsp | |||
/* Initialise the state: rdi = state, rsi = the caller's rcx, and rdx */
/* still holds the input length. */
movq %rsp, %rdi | |||
call poly1305_init_ext_sse2_local | |||
/* r14 = length rounded down to a multiple of 32. The je consumes the */
/* zero flag from this andq; since the length is at least 128 on this */
/* path the result is never zero here. */
andq $-32, %r14 | |||
je poly1305_auth_sse2_42 | |||
/* Process the whole 32-byte multiples: rdi = state, rsi = input, */
/* rdx = rounded length. */
movq %r12, %rsi | |||
movq %r14, %rdx | |||
movq %rsp, %rdi | |||
call poly1305_blocks_sse2_local | |||
/* Advance the input pointer and reduce the remaining length (now < 32). */
addq %r14, %r12 | |||
subq %r14, %rbx | |||
poly1305_auth_sse2_42: | |||
/* Finish: rdi = state, rsi = remaining input, rdx = remaining length, */
/* rcx = result destination. */
movq %r13, %rcx | |||
movq %rbx, %rdx | |||
movq %r12, %rsi | |||
movq %rsp, %rdi | |||
call poly1305_finish_ext_sse2_local | |||
/* Undo the alignment: point rsp at the four registers saved below rbp, */
/* then restore them in reverse push order. */
leaq -32(%rbp), %rsp | |||
popq %rbx | |||
popq %r12 | |||
popq %r13 | |||
popq %r14 | |||
popq %rbp | |||
ret | |||
FN_END poly1305_auth_sse2 | |||