diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-02-07 22:10:07 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-02-07 22:28:36 +0000 |
commit | 06a8ad2bae9e0aa0fe62e6059198bb3ec57eb08f (patch) | |
tree | a9d9d28c941840486c6697362dac219204672fda /src/libcryptobox/poly1305 | |
parent | 1e2ff82baa69251c79576609c2a94bd0c006cd72 (diff) | |
download | rspamd-06a8ad2bae9e0aa0fe62e6059198bb3ec57eb08f.tar.gz rspamd-06a8ad2bae9e0aa0fe62e6059198bb3ec57eb08f.zip |
Use optimized version of poly1305.
Diffstat (limited to 'src/libcryptobox/poly1305')
-rw-r--r-- | src/libcryptobox/poly1305/README.md | 112 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/avx.S | 875 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/avx2.S | 1093 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/constants.S | 21 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna-16.h | 202 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna-32.h | 219 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna-64.h | 224 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna-8.h | 186 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna.c | 201 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305-donna.h | 20 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305.c | 222 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305.h | 38 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/ref-32.c | 237 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/ref-64.c | 231 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/sse2.S | 966 |
15 files changed, 3683 insertions, 1164 deletions
diff --git a/src/libcryptobox/poly1305/README.md b/src/libcryptobox/poly1305/README.md deleted file mode 100644 index 510e0f781..000000000 --- a/src/libcryptobox/poly1305/README.md +++ /dev/null @@ -1,112 +0,0 @@ -"A state-of-the-art message-authentication code"
-
-# ABOUT
-
-See: [http://cr.yp.to/mac.html](http://cr.yp.to/mac.html) and [http://cr.yp.to/mac/poly1305-20050329.pdf](http://cr.yp.to/mac/poly1305-20050329.pdf)
-
-These are quite portable implementations of increasing efficiency depending on the size of the multiplier available.
-Optimized implementations have been moved to [poly1305-opt](https://github.com/floodyberry/poly1305-opt)
-
-# BUILDING
-
-## Default
-
-If compiled with no options, `poly1305-donna.c` will select between the 32 bit and 64 bit implementations based
-on what it can tell the compiler supports.
-
- gcc poly1305-donna.c -O3 -o poly1305.o
-
-## Selecting a specific version
-
- gcc poly1305-donna.c -O3 -o poly1305.o -DPOLY1305_XXBITS
-
-Where `-DPOLY1305_XXBITS` is one of
-
- * `-DPOLY1305_8BITS`, 8->16 bit multiplies, 32 bit additions
- * `-DPOLY1305_16BITS`, 16->32 bit multiplies, 32 bit additions
- * `-DPOLY1305_32BITS`, 32->64 bit multiplies, 64 bit additions
- * `-DPOLY1305_64BITS`, 64->128 bit multiplies, 128 bit additions
-
-8 bit and 16 bit versions were written to keep the code size small, 32 bit and 64 bit versions are mildly optimized due
-to needing fewer multiplications. All 4 can be made faster at the expense of increased code size and complexity, which
-is not the intention of this project.
-
-# USAGE
-
-See: [http://nacl.cace-project.eu/onetimeauth.html](http://nacl.cace-project.eu/onetimeauth.html), in specific, slightly plagiarized:
-
-The poly1305_auth function, viewed as a function of the message for a uniform random key, is
-designed to meet the standard notion of unforgeability after a single message. After the sender
-authenticates one message, an attacker cannot find authenticators for any other messages.
-
-The sender **MUST NOT** use poly1305_auth to authenticate more than one message under the same key.
-Authenticators for two messages under the same key should be expected to reveal enough information
-to allow forgeries of authenticators on other messages.
-
-## Functions
-
-`poly1305_context` is declared in [poly1305.h](poly1305.h) and is an opaque structure large enough to support
-every underlying platform specific implementation. It should be size_t aligned, which should be handled already
-with the size_t member `aligner`.
-
-`void poly1305_init(poly1305_context *ctx, const unsigned char key[32]);`
-
-where
-
-`key` is the 32 byte key that is **only used for this message and is discarded immediately after**
-
-`void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes);`
-
-where `m` is a pointer to the message fragment to be processed, and
-
-`bytes` is the length of the message fragment
-
-`void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]);`
-
-where `mac` is the buffer which receives the 16 byte authenticator. After calling finish, the underlying
-implementation will zero out `ctx`.
-
-`void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]);`
-
-where `mac` is the buffer which receives the 16 byte authenticator,
-
-`m` is a pointer to the message to be processed,
-
-`bytes` is the number of bytes in the message, and
-
-`key` is the 32 byte key that is **only used for this message and is discarded immediately after**.
-
-`int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);`
-
-where `mac1` is compared to `mac2` in constant time and returns `1` if they are equal and `0` if they are not
-
-`int poly1305_power_on_self_test(void);`
-
-tests the underlying implementation to verify it is working correctly. It returns `1` if all tests pass, and `0` if
-any tests fail.
-
-## Example
-
-### Simple
-
- #include "poly1305-donna.h"
-
- unsigned char key[32] = {...}, mac[16];
- unsigned char msg[] = {...};
-
- poly1305_auth(mac, msg, msglen, key);
-
-### Full
-
-[example-poly1305.c](example-poly1305.c) is a simple example of how to verify the underlying implementation is producing
-the correct results, compute an authenticator, and test it against an expected value.
-
-# LICENSE
-
-[MIT](http://www.opensource.org/licenses/mit-license.php) or PUBLIC DOMAIN
-
-
-# NAMESAKE
-
-I borrowed the idea for these from Adam Langley's [curve25519-donna](http://github.com/agl/curve25519-donna), hence
-the name.
\ No newline at end of file diff --git a/src/libcryptobox/poly1305/avx.S b/src/libcryptobox/poly1305/avx.S new file mode 100644 index 000000000..a5c4ccf26 --- /dev/null +++ b/src/libcryptobox/poly1305/avx.S @@ -0,0 +1,875 @@ +#include "../chacha20/macro.S" +#include "constants.S" + +SECTION_TEXT + +GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx,0,0 +movl $32, %eax +ret +FN_END poly1305_block_size_avx + +GLOBAL_HIDDEN_FN_EXT poly1305_init_ext_avx,4,1 +poly1305_init_ext_avx_local: +pushq %r15 +pushq %r14 +pushq %r13 +pushq %r12 +pushq %rbp +pushq %rbx +movq %rdi, %rbp +testq %rdx, %rdx +movq $-1, %rax +cmovne %rdx, %rax +movq %rax, -16(%rsp) +vpxor %xmm0, %xmm0, %xmm0 +vmovdqu %xmm0, (%rdi) +vmovdqu %xmm0, 16(%rdi) +vmovdqu %xmm0, 32(%rdi) +movq (%rsi), %r9 +movq 8(%rsi), %r8 +movabsq $17575274610687, %r10 +andq %r9, %r10 +shrq $44, %r9 +movq %r8, %rax +salq $20, %rax +orq %rax, %r9 +movabsq $17592181915647, %rax +andq %rax, %r9 +shrq $24, %r8 +movabsq $68719475727, %rax +andq %rax, %r8 +leaq 40(%rdi), %r15 +movl %r10d, %eax +andl $67108863, %eax +movl %eax, 40(%rdi) +movl %r9d, %edx +sall $18, %edx +movq %r10, %rax +shrq $26, %rax +orl %edx, %eax +andl $67108863, %eax +movl %eax, 44(%rdi) +movq %r9, %rax +shrq $8, %rax +andl $67108863, %eax +movl %eax, 48(%rdi) +movq %r9, %rax +shrq $34, %rax +movl %r8d, %edx +sall $10, %edx +orl %edx, %eax +andl $67108863, %eax +movl %eax, 52(%rdi) +movq %r8, %rax +shrq $16, %rax +movl %eax, 56(%rdi) +movq 16(%rsi), %rax +movq %rax, 104(%rdi) +movq 24(%rsi), %rax +movq %rax, 112(%rdi) +movl $0, %ebx +.L7: +testq %rbx, %rbx +jne .L4 +leaq 60(%rbp), %r15 +cmpq $16, -16(%rsp) +ja .L6 +jmp .L5 +.L4: +cmpq $1, %rbx +jne .L6 +leaq 80(%rbp), %r15 +cmpq $95, -16(%rsp) +jbe .L5 +.L6: +leaq (%r8,%r8,4), %rsi +salq $2, %rsi +leaq (%r9,%r9), %rdi +movq %rdi, %rax +mulq %rsi +movq %rax, %r13 +movq %rdx, %r14 +movq %r10, %rax +mulq %r10 +addq %r13, %rax +adcq %r14, %rdx +movabsq $17592186044415, %rcx +movq %rax, -72(%rsp) +movq %rdx, -64(%rsp) 
+andq -72(%rsp), %rcx +leaq (%r10,%r10), %r11 +movq %r11, %rax +mulq %r9 +movq %rax, %r11 +movq %rdx, %r12 +movq %rsi, %rax +mulq %r8 +movq %rax, %r13 +movq %rdx, %r14 +addq %r11, %r13 +adcq %r12, %r14 +movq -72(%rsp), %rax +movq -64(%rsp), %rdx +shrdq $44, %rdx, %rax +movq %rax, -56(%rsp) +movq $0, -48(%rsp) +addq -56(%rsp), %r13 +adcq -48(%rsp), %r14 +movabsq $17592186044415, %rsi +andq %r13, %rsi +leaq (%r8,%r8), %rdi +movq %rdi, %rax +mulq %r10 +movq %rax, %r11 +movq %rdx, %r12 +movq %r9, %rax +mulq %r9 +addq %r11, %rax +adcq %r12, %rdx +shrdq $44, %r14, %r13 +movq %r13, -40(%rsp) +movq $0, -32(%rsp) +addq -40(%rsp), %rax +adcq -32(%rsp), %rdx +movabsq $4398046511103, %rdi +andq %rax, %rdi +shrdq $42, %rdx, %rax +leaq (%rax,%rax,4), %r8 +addq %rcx, %r8 +movabsq $17592186044415, %r10 +andq %r8, %r10 +shrq $44, %r8 +addq %rsi, %r8 +movabsq $17592186044415, %r9 +andq %r8, %r9 +shrq $44, %r8 +addq %rdi, %r8 +movl %r10d, %eax +andl $67108863, %eax +movl %eax, (%r15) +movl %r9d, %edx +sall $18, %edx +movq %r10, %rax +shrq $26, %rax +orl %edx, %eax +andl $67108863, %eax +movl %eax, 4(%r15) +movq %r9, %rax +shrq $8, %rax +andl $67108863, %eax +movl %eax, 8(%r15) +movl %r8d, %edx +sall $10, %edx +movq %r9, %rax +shrq $34, %rax +orl %edx, %eax +andl $67108863, %eax +movl %eax, 12(%r15) +movq %r8, %rax +shrq $16, %rax +movl %eax, 16(%r15) +addq $1, %rbx +cmpq $2, %rbx +jne .L7 +.L5: +movq $0, 120(%rbp) +popq %rbx +popq %rbp +popq %r12 +popq %r13 +popq %r14 +popq %r15 +ret +FN_END poly1305_init_ext_avx + + + +GLOBAL_HIDDEN_FN poly1305_blocks_avx +poly1305_blocks_avx_local: +pushq %rbp +movq %rsp, %rbp +pushq %rbx +andq $-64, %rsp +subq $200, %rsp +movl $(1 << 24), %eax +movl $((1 << 26) - 1), %r8d +movl $(5), %r9d +vmovd %eax, %xmm1 +vmovd %r8d, %xmm0 +vmovd %r9d, %xmm2 +vpshufd $68, %xmm1, %xmm1 +vpshufd $68, %xmm0, %xmm0 +vpshufd $68, %xmm2, %xmm2 +vmovdqa %xmm1, 152(%rsp) +vmovdqa %xmm2, 184(%rsp) +movq 120(%rdi), %rax +testb $4, %al +je .L12 +vpsrldq $8, %xmm1, %xmm1 
+vmovdqa %xmm1, 152(%rsp) +.L12: +testb $8, %al +je .L13 +vpxor %xmm1, %xmm1, %xmm1 +vmovdqa %xmm1, 152(%rsp) +.L13: +testb $1, %al +jne .L14 +vmovq (%rsi), %xmm1 +vpinsrq $1, 16(%rsi), %xmm1, %xmm1 +vmovq 8(%rsi), %xmm3 +vpinsrq $1, 24(%rsi), %xmm3, %xmm2 +vpand %xmm0, %xmm1, %xmm7 +vpsrlq $26, %xmm1, %xmm12 +vpand %xmm0, %xmm12, %xmm12 +vpsllq $12, %xmm2, %xmm3 +vpsrlq $52, %xmm1, %xmm1 +vpor %xmm3, %xmm1, %xmm6 +vpand %xmm0, %xmm6, %xmm3 +vpsrlq $26, %xmm6, %xmm6 +vpand %xmm0, %xmm6, %xmm6 +vpsrlq $40, %xmm2, %xmm2 +vpor 152(%rsp), %xmm2, %xmm2 +addq $32, %rsi +subq $32, %rdx +orq $1, %rax +movq %rax, 120(%rdi) +jmp .L15 +.L14: +vmovdqu (%rdi), %xmm12 +vmovdqu 16(%rdi), %xmm6 +vmovdqu 32(%rdi), %xmm2 +vpshufd $80, %xmm12, %xmm7 +vpshufd $250, %xmm12, %xmm12 +vpshufd $80, %xmm6, %xmm3 +vpshufd $250, %xmm6, %xmm6 +vpshufd $80, %xmm2, %xmm2 +.L15: +movq 120(%rdi), %rax +testb $48, %al +je .L16 +testb $16, %al +je .L17 +vmovdqu 40(%rdi), %xmm1 +vmovd 56(%rdi), %xmm4 +vmovdqu 60(%rdi), %xmm5 +vpunpckldq %xmm1, %xmm5, %xmm11 +vpunpckhdq %xmm1, %xmm5, %xmm5 +vmovd 76(%rdi), %xmm1 +vpunpcklqdq %xmm4, %xmm1, %xmm4 +jmp .L18 +.L17: +movl $(1), %r8d +vmovdqu 40(%rdi), %xmm5 +vmovd 56(%rdi), %xmm4 +vmovd %r8d, %xmm1 +vpunpckldq %xmm1, %xmm5, %xmm11 +vpunpckhdq %xmm1, %xmm5, %xmm5 +.L18: +vpshufd $80, %xmm11, %xmm1 +vpshufd $250, %xmm11, %xmm11 +vpshufd $80, %xmm5, %xmm10 +vpshufd $250, %xmm5, %xmm5 +jmp .L19 +.L16: +vmovdqu 60(%rdi), %xmm5 +vpshufd $0, %xmm5, %xmm1 +vpshufd $85, %xmm5, %xmm11 +vpshufd $170, %xmm5, %xmm10 +vpshufd $255, %xmm5, %xmm5 +vmovd 76(%rdi), %xmm4 +vpshufd $0, %xmm4, %xmm4 +.L19: +vmovdqa %xmm11, 136(%rsp) +vpmuludq 184(%rsp), %xmm11, %xmm13 +vmovdqa %xmm13, 120(%rsp) +vmovdqa %xmm10, 104(%rsp) +vpmuludq 184(%rsp), %xmm10, %xmm13 +vmovdqa %xmm13, 88(%rsp) +vmovdqa %xmm5, 72(%rsp) +vpmuludq 184(%rsp), %xmm5, %xmm5 +vmovdqa %xmm5, 56(%rsp) +vmovdqa %xmm4, 40(%rsp) +vpmuludq 184(%rsp), %xmm4, %xmm4 +vmovdqa %xmm4, 24(%rsp) +cmpq $63, %rdx +jbe .L20 
+vmovdqu 80(%rdi), %xmm4 +vpshufd $0, %xmm4, %xmm5 +vmovdqa %xmm5, 8(%rsp) +vpshufd $85, %xmm4, %xmm5 +vmovdqa %xmm5, -8(%rsp) +vpshufd $170, %xmm4, %xmm13 +vmovdqa %xmm13, -24(%rsp) +vpshufd $255, %xmm4, %xmm4 +vmovdqa %xmm4, %xmm10 +vmovdqa %xmm4, -40(%rsp) +vmovd 96(%rdi), %xmm4 +vpshufd $0, %xmm4, %xmm4 +vmovdqa %xmm4, %xmm8 +vmovdqa %xmm4, -56(%rsp) +vpmuludq 184(%rsp), %xmm5, %xmm4 +vmovdqa %xmm4, -72(%rsp) +vpmuludq 184(%rsp), %xmm13, %xmm4 +vmovdqa %xmm4, -88(%rsp) +vpmuludq 184(%rsp), %xmm10, %xmm4 +vmovdqa %xmm4, -104(%rsp) +vpmuludq 184(%rsp), %xmm8, %xmm4 +vmovdqa %xmm4, -120(%rsp) +leaq 32(%rsi), %rax +movq %rdx, %rcx +vmovdqa %xmm1, 168(%rsp) +jmp .L22 +.p2align 6 +nop +nop +nop +nop +.L22: +vpmuludq -72(%rsp), %xmm2, %xmm13 +vmovdqa -88(%rsp), %xmm5 +vpmuludq %xmm5, %xmm6, %xmm4 +vpmuludq %xmm5, %xmm2, %xmm11 +vmovdqa -104(%rsp), %xmm9 +vpmuludq %xmm9, %xmm6, %xmm5 +vpmuludq %xmm9, %xmm2, %xmm10 +vpaddq %xmm4, %xmm13, %xmm13 +vpmuludq %xmm9, %xmm3, %xmm4 +vmovdqa -120(%rsp), %xmm8 +vpmuludq %xmm8, %xmm2, %xmm9 +vpaddq %xmm5, %xmm11, %xmm11 +vmovdqa %xmm8, %xmm5 +vpmuludq %xmm8, %xmm12, %xmm8 +vpmuludq %xmm5, %xmm3, %xmm14 +vpaddq %xmm4, %xmm13, %xmm13 +vpmuludq %xmm5, %xmm6, %xmm4 +vmovdqa 8(%rsp), %xmm15 +vpmuludq %xmm15, %xmm6, %xmm5 +vpaddq %xmm8, %xmm13, %xmm13 +vpmuludq %xmm15, %xmm2, %xmm8 +vpaddq %xmm14, %xmm11, %xmm11 +vpmuludq %xmm15, %xmm7, %xmm14 +vpaddq %xmm4, %xmm10, %xmm10 +vpmuludq %xmm15, %xmm12, %xmm4 +vpaddq %xmm5, %xmm9, %xmm9 +vpmuludq %xmm15, %xmm3, %xmm5 +vmovdqa -8(%rsp), %xmm15 +vpmuludq %xmm15, %xmm3, %xmm2 +vpaddq %xmm14, %xmm13, %xmm13 +vpmuludq %xmm15, %xmm6, %xmm6 +vpaddq %xmm4, %xmm11, %xmm11 +vpmuludq %xmm15, %xmm7, %xmm4 +vpaddq %xmm5, %xmm10, %xmm10 +vmovq -32(%rax), %xmm5 +vpinsrq $1, -16(%rax), %xmm5, %xmm5 +vpmuludq %xmm15, %xmm12, %xmm14 +vpaddq %xmm2, %xmm9, %xmm9 +vmovdqa -24(%rsp), %xmm2 +vpmuludq %xmm2, %xmm12, %xmm15 +vpaddq %xmm6, %xmm8, %xmm8 +vpmuludq %xmm2, %xmm3, %xmm3 +vpaddq %xmm4, %xmm11, %xmm11 
+vmovq -24(%rax), %xmm4 +vpinsrq $1, -8(%rax), %xmm4, %xmm6 +vpmuludq %xmm2, %xmm7, %xmm4 +vpaddq %xmm14, %xmm10, %xmm10 +vmovdqa -40(%rsp), %xmm1 +vpmuludq %xmm1, %xmm7, %xmm14 +vpaddq %xmm15, %xmm9, %xmm9 +vpand %xmm5, %xmm0, %xmm2 +vpmuludq %xmm1, %xmm12, %xmm12 +vpaddq %xmm3, %xmm8, %xmm8 +vpsrlq $26, %xmm5, %xmm3 +vpand %xmm3, %xmm0, %xmm3 +vpmuludq -56(%rsp), %xmm7, %xmm7 +vpaddq %xmm4, %xmm10, %xmm10 +vpsllq $12, %xmm6, %xmm15 +vpsrlq $52, %xmm5, %xmm4 +vpor %xmm15, %xmm4, %xmm4 +vpaddq %xmm14, %xmm9, %xmm9 +vpsrlq $14, %xmm6, %xmm5 +vpand %xmm5, %xmm0, %xmm5 +vpaddq %xmm12, %xmm8, %xmm8 +vpand %xmm4, %xmm0, %xmm4 +vpaddq %xmm7, %xmm8, %xmm8 +vpsrlq $40, %xmm6, %xmm6 +vpor 152(%rsp), %xmm6, %xmm6 +vmovdqu (%rax), %xmm12 +vmovdqu 16(%rax), %xmm7 +vpunpckldq %xmm7, %xmm12, %xmm15 +vpunpckhdq %xmm7, %xmm12, %xmm7 +vpxor %xmm14, %xmm14, %xmm14 +vpunpckldq %xmm14, %xmm15, %xmm12 +vpunpckhdq %xmm14, %xmm15, %xmm15 +vpunpckldq %xmm14, %xmm7, %xmm14 +vpxor %xmm1, %xmm1, %xmm1 +vpunpckhdq %xmm1, %xmm7, %xmm7 +vpsllq $6, %xmm15, %xmm15 +vpsllq $12, %xmm14, %xmm14 +vpsllq $18, %xmm7, %xmm7 +vpaddq %xmm12, %xmm13, %xmm12 +vpaddq %xmm15, %xmm11, %xmm15 +vpaddq %xmm14, %xmm10, %xmm14 +vpaddq %xmm7, %xmm9, %xmm7 +vpaddq 152(%rsp), %xmm8, %xmm8 +vpmuludq 120(%rsp), %xmm6, %xmm13 +vmovdqa 88(%rsp), %xmm10 +vpmuludq %xmm10, %xmm5, %xmm9 +vpmuludq %xmm10, %xmm6, %xmm11 +vmovdqa 56(%rsp), %xmm1 +vpmuludq %xmm1, %xmm5, %xmm10 +vpaddq %xmm13, %xmm12, %xmm12 +vpmuludq %xmm1, %xmm6, %xmm13 +vpaddq %xmm9, %xmm12, %xmm12 +vpmuludq %xmm1, %xmm4, %xmm9 +vpaddq %xmm11, %xmm15, %xmm15 +vmovdqa 24(%rsp), %xmm1 +vpmuludq %xmm1, %xmm6, %xmm11 +vpaddq %xmm10, %xmm15, %xmm10 +vpmuludq %xmm1, %xmm3, %xmm15 +vpaddq %xmm13, %xmm14, %xmm14 +vpmuludq %xmm1, %xmm4, %xmm13 +vpaddq %xmm9, %xmm12, %xmm9 +vpmuludq %xmm1, %xmm5, %xmm12 +vpaddq %xmm11, %xmm7, %xmm7 +vpmuludq 168(%rsp), %xmm5, %xmm11 +vpaddq %xmm15, %xmm9, %xmm9 +vpmuludq 168(%rsp), %xmm6, %xmm6 +vpaddq %xmm13, %xmm10, %xmm10 +vpmuludq 
168(%rsp), %xmm2, %xmm15 +vpaddq %xmm12, %xmm14, %xmm14 +vpmuludq 168(%rsp), %xmm3, %xmm13 +vpaddq %xmm11, %xmm7, %xmm11 +vpmuludq 168(%rsp), %xmm4, %xmm12 +vpaddq %xmm6, %xmm8, %xmm6 +vmovdqa 136(%rsp), %xmm8 +vpmuludq %xmm8, %xmm4, %xmm7 +vpaddq %xmm15, %xmm9, %xmm9 +vpmuludq %xmm8, %xmm5, %xmm5 +vpaddq %xmm13, %xmm10, %xmm10 +vpmuludq %xmm8, %xmm2, %xmm15 +vpaddq %xmm12, %xmm14, %xmm14 +vpmuludq %xmm8, %xmm3, %xmm8 +vpaddq %xmm7, %xmm11, %xmm11 +vmovdqa 104(%rsp), %xmm7 +vpmuludq %xmm7, %xmm3, %xmm13 +vpaddq %xmm5, %xmm6, %xmm6 +vpmuludq %xmm7, %xmm4, %xmm4 +vpaddq %xmm15, %xmm10, %xmm10 +vpmuludq %xmm7, %xmm2, %xmm15 +vpaddq %xmm8, %xmm14, %xmm14 +vmovdqa 72(%rsp), %xmm5 +vpmuludq %xmm5, %xmm2, %xmm7 +vpaddq %xmm13, %xmm11, %xmm11 +vpmuludq %xmm5, %xmm3, %xmm3 +vpaddq %xmm4, %xmm6, %xmm6 +vpmuludq 40(%rsp), %xmm2, %xmm2 +vpaddq %xmm15, %xmm14, %xmm14 +vpaddq %xmm7, %xmm11, %xmm11 +vpaddq %xmm3, %xmm6, %xmm6 +vpaddq %xmm2, %xmm6, %xmm2 +vpsrlq $26, %xmm9, %xmm12 +vpsrlq $26, %xmm11, %xmm5 +vpand %xmm0, %xmm9, %xmm9 +vpand %xmm0, %xmm11, %xmm11 +vpaddq %xmm12, %xmm10, %xmm10 +vpaddq %xmm5, %xmm2, %xmm2 +vpsrlq $26, %xmm10, %xmm3 +vpsrlq $26, %xmm2, %xmm7 +vpand %xmm0, %xmm10, %xmm10 +vpand %xmm0, %xmm2, %xmm2 +vpaddq %xmm3, %xmm14, %xmm3 +vpmuludq 184(%rsp), %xmm7, %xmm7 +vpaddq %xmm7, %xmm9, %xmm9 +vpsrlq $26, %xmm3, %xmm6 +vpsrlq $26, %xmm9, %xmm12 +vpand %xmm0, %xmm3, %xmm3 +vpand %xmm0, %xmm9, %xmm7 +vpaddq %xmm6, %xmm11, %xmm6 +vpaddq %xmm12, %xmm10, %xmm12 +vpsrlq $26, %xmm6, %xmm8 +vpand %xmm0, %xmm6, %xmm6 +vpaddq %xmm8, %xmm2, %xmm2 +subq $64, %rcx +addq $64, %rax +cmpq $63, %rcx +ja .L22 +vmovdqa 168(%rsp), %xmm1 +leaq -64(%rdx), %rax +andq $-64, %rax +leaq 64(%rsi,%rax), %rsi +andl $63, %edx +.L20: +cmpq $31, %rdx +jbe .L23 +vpmuludq 120(%rsp), %xmm2, %xmm11 +vmovdqa 88(%rsp), %xmm4 +vpmuludq %xmm4, %xmm6, %xmm0 +vpmuludq %xmm4, %xmm2, %xmm10 +vmovdqa 56(%rsp), %xmm4 +vpmuludq %xmm4, %xmm6, %xmm8 +vpmuludq %xmm4, %xmm2, %xmm5 +vpaddq %xmm0, %xmm11, 
%xmm11 +vpmuludq %xmm4, %xmm3, %xmm0 +vmovdqa 24(%rsp), %xmm13 +vpmuludq %xmm13, %xmm2, %xmm4 +vpaddq %xmm8, %xmm10, %xmm10 +vpmuludq %xmm13, %xmm12, %xmm8 +vpmuludq %xmm13, %xmm3, %xmm9 +vpaddq %xmm0, %xmm11, %xmm11 +vpmuludq %xmm13, %xmm6, %xmm13 +vpmuludq %xmm1, %xmm6, %xmm0 +vpaddq %xmm8, %xmm11, %xmm8 +vpmuludq %xmm1, %xmm2, %xmm2 +vpaddq %xmm9, %xmm10, %xmm9 +vpmuludq %xmm1, %xmm7, %xmm11 +vpaddq %xmm13, %xmm5, %xmm5 +vpmuludq %xmm1, %xmm12, %xmm10 +vpaddq %xmm0, %xmm4, %xmm0 +vpmuludq %xmm1, %xmm3, %xmm1 +vmovdqa 136(%rsp), %xmm4 +vpmuludq %xmm4, %xmm3, %xmm14 +vpaddq %xmm11, %xmm8, %xmm11 +vpmuludq %xmm4, %xmm6, %xmm6 +vpaddq %xmm10, %xmm9, %xmm9 +vpmuludq %xmm4, %xmm7, %xmm15 +vpaddq %xmm1, %xmm5, %xmm5 +vpmuludq %xmm4, %xmm12, %xmm1 +vpaddq %xmm14, %xmm0, %xmm0 +vmovdqa 104(%rsp), %xmm4 +vpmuludq %xmm4, %xmm12, %xmm8 +vpaddq %xmm6, %xmm2, %xmm2 +vpmuludq %xmm4, %xmm3, %xmm3 +vpaddq %xmm15, %xmm9, %xmm9 +vpmuludq %xmm4, %xmm7, %xmm10 +vpaddq %xmm1, %xmm5, %xmm1 +vmovdqa 72(%rsp), %xmm4 +vpmuludq %xmm4, %xmm7, %xmm15 +vpaddq %xmm8, %xmm0, %xmm0 +vpmuludq %xmm4, %xmm12, %xmm12 +vpaddq %xmm3, %xmm2, %xmm2 +vpmuludq 40(%rsp), %xmm7, %xmm7 +vpaddq %xmm10, %xmm1, %xmm1 +vpaddq %xmm15, %xmm0, %xmm0 +vpaddq %xmm12, %xmm2, %xmm2 +vpaddq %xmm7, %xmm2, %xmm2 +movl $((1 << 26) - 1), %r8d +testq %rsi, %rsi +vmovd %r8d, %xmm15 +je .L24 +vmovdqu (%rsi), %xmm4 +vmovdqu 16(%rsi), %xmm3 +vpunpckldq %xmm3, %xmm4, %xmm5 +vpunpckhdq %xmm3, %xmm4, %xmm3 +vpxor %xmm4, %xmm4, %xmm4 +vpunpckldq %xmm4, %xmm5, %xmm7 +vpunpckhdq %xmm4, %xmm5, %xmm5 +vpunpckldq %xmm4, %xmm3, %xmm6 +vpunpckhdq %xmm4, %xmm3, %xmm3 +vpsllq $6, %xmm5, %xmm5 +vpsllq $12, %xmm6, %xmm6 +vpsllq $18, %xmm3, %xmm3 +vpaddq %xmm7, %xmm11, %xmm11 +vpaddq %xmm5, %xmm9, %xmm9 +vpaddq %xmm6, %xmm1, %xmm1 +vpaddq %xmm3, %xmm0, %xmm0 +vpaddq 152(%rsp), %xmm2, %xmm2 +.L24: +vpshufd $68, %xmm15, %xmm15 +vpsrlq $26, %xmm11, %xmm12 +vpsrlq $26, %xmm0, %xmm3 +vpand %xmm15, %xmm11, %xmm11 +vpand %xmm15, %xmm0, %xmm6 +vpaddq 
%xmm12, %xmm9, %xmm9 +vpaddq %xmm3, %xmm2, %xmm2 +vpsrlq $26, %xmm9, %xmm3 +vpsrlq $26, %xmm2, %xmm7 +vpand %xmm15, %xmm9, %xmm9 +vpand %xmm15, %xmm2, %xmm2 +vpaddq %xmm3, %xmm1, %xmm3 +vpmuludq 184(%rsp), %xmm7, %xmm7 +vpaddq %xmm7, %xmm11, %xmm7 +vpsrlq $26, %xmm3, %xmm4 +vpsrlq $26, %xmm7, %xmm1 +vpand %xmm15, %xmm3, %xmm3 +vpand %xmm15, %xmm7, %xmm7 +vpaddq %xmm4, %xmm6, %xmm6 +vpaddq %xmm1, %xmm9, %xmm12 +vpsrlq $26, %xmm6, %xmm0 +vpand %xmm15, %xmm6, %xmm6 +vpaddq %xmm0, %xmm2, %xmm2 +.L23: +testq %rsi, %rsi +je .L25 +vpshufd $8, %xmm7, %xmm7 +vpshufd $8, %xmm12, %xmm12 +vpshufd $8, %xmm3, %xmm3 +vpshufd $8, %xmm6, %xmm6 +vpshufd $8, %xmm2, %xmm2 +vpunpcklqdq %xmm12, %xmm7, %xmm7 +vpunpcklqdq %xmm6, %xmm3, %xmm3 +vmovdqu %xmm7, (%rdi) +vmovdqu %xmm3, 16(%rdi) +vmovq %xmm2, 32(%rdi) +jmp .L11 +.L25: +vpsrldq $8, %xmm7, %xmm0 +vpaddq %xmm0, %xmm7, %xmm7 +vpsrldq $8, %xmm12, %xmm0 +vpaddq %xmm0, %xmm12, %xmm12 +vpsrldq $8, %xmm3, %xmm0 +vpaddq %xmm0, %xmm3, %xmm3 +vpsrldq $8, %xmm6, %xmm0 +vpaddq %xmm0, %xmm6, %xmm6 +vpsrldq $8, %xmm2, %xmm0 +vpaddq %xmm0, %xmm2, %xmm2 +vmovd %xmm7, %eax +vmovd %xmm12, %edx +movl %eax, %r9d +shrl $26, %r9d +addl %edx, %r9d +movl %r9d, %r8d +andl $67108863, %r8d +vmovd %xmm3, %edx +shrl $26, %r9d +addl %edx, %r9d +vmovd %xmm6, %edx +movl %r9d, %ecx +shrl $26, %ecx +addl %edx, %ecx +movl %ecx, %esi +andl $67108863, %esi +vmovd %xmm2, %r10d +movl %r8d, %r11d +salq $26, %r11 +andl $67108863, %eax +orq %rax, %r11 +movabsq $17592186044415, %rax +andq %rax, %r11 +andl $67108863, %r9d +salq $8, %r9 +shrl $18, %r8d +movl %r8d, %r8d +orq %r8, %r9 +movq %rsi, %rdx +salq $34, %rdx +orq %rdx, %r9 +andq %rax, %r9 +shrl $26, %ecx +addl %r10d, %ecx +salq $16, %rcx +shrl $10, %esi +movl %esi, %esi +orq %rsi, %rcx +movabsq $4398046511103, %r10 +movq %rcx, %r8 +andq %r10, %r8 +shrq $42, %rcx +leaq (%rcx,%rcx,4), %rdx +addq %r11, %rdx +movq %rdx, %rsi +andq %rax, %rsi +shrq $44, %rdx +addq %r9, %rdx +movq %rdx, %rcx +andq %rax, %rcx +shrq $44, %rdx 
+addq %r8, %rdx +andq %rdx, %r10 +shrq $42, %rdx +leaq (%rsi,%rdx,4), %rsi +leaq (%rsi,%rdx), %r11 +movq %r11, %rbx +andq %rax, %rbx +shrq $44, %r11 +addq %rcx, %r11 +leaq 5(%rbx), %r9 +movq %r9, %r8 +shrq $44, %r8 +addq %r11, %r8 +movabsq $-4398046511104, %rsi +addq %r10, %rsi +movq %r8, %rdx +shrq $44, %rdx +addq %rdx, %rsi +movq %rsi, %rdx +shrq $63, %rdx +subq $1, %rdx +movq %rdx, %rcx +notq %rcx +andq %rcx, %rbx +andq %rcx, %r11 +andq %r10, %rcx +andq %rax, %r9 +andq %rdx, %r9 +orq %r9, %rbx +movq %rbx, (%rdi) +andq %r8, %rax +andq %rdx, %rax +orq %rax, %r11 +movq %r11, 8(%rdi) +andq %rsi, %rdx +orq %rcx, %rdx +movq %rdx, 16(%rdi) +.L11: +movq -8(%rbp), %rbx +leave +ret +FN_END poly1305_blocks_avx + +GLOBAL_HIDDEN_FN poly1305_finish_ext_avx +poly1305_finish_ext_avx_local: +pushq %r12 +pushq %rbp +pushq %rbx +subq $32, %rsp +movq %rdi, %rbx +movq %rdx, %rbp +movq %rcx, %r12 +testq %rdx, %rdx +je .L30 +movq $0, (%rsp) +movq $0, 8(%rsp) +movq $0, 16(%rsp) +movq $0, 24(%rsp) +movq %rsp, %rax +subq %rsp, %rsi +testb $16, %dl +je .L31 +vmovdqu (%rsp,%rsi), %xmm0 +vmovdqa %xmm0, (%rsp) +addq $16, %rax +.L31: +testb $8, %bpl +je .L32 +movq (%rax,%rsi), %rdx +movq %rdx, (%rax) +addq $8, %rax +.L32: +testb $4, %bpl +je .L33 +movl (%rax,%rsi), %edx +movl %edx, (%rax) +addq $4, %rax +.L33: +testb $2, %bpl +je .L34 +movzwl (%rax,%rsi), %edx +movw %dx, (%rax) +addq $2, %rax +.L34: +testb $1, %bpl +je .L35 +movzbl (%rax,%rsi), %edx +movb %dl, (%rax) +.L35: +cmpq $16, %rbp +je .L36 +movb $1, (%rsp,%rbp) +movq 120(%rbx), %rdx +cmpq $16, %rbp +sbbq %rax, %rax +andl $4, %eax +addq $4, %rax +.L37: +orq %rdx, %rax +movq %rax, 120(%rbx) +movq %rsp, %rsi +movl $32, %edx +movq %rbx, %rdi +call poly1305_blocks_avx_local +.L30: +movq 120(%rbx), %rax +testb $1, %al +je .L38 +subq $1, %rbp +cmpq $15, %rbp +jbe .L39 +orq $16, %rax +movq %rax, 120(%rbx) +jmp .L40 +.L39: +orq $32, %rax +movq %rax, 120(%rbx) +.L40: +movl $32, %edx +movl $0, %esi +movq %rbx, %rdi +call 
poly1305_blocks_avx_local +.L38: +movq 8(%rbx), %rax +movq %rax, %rdx +salq $44, %rdx +orq (%rbx), %rdx +shrq $20, %rax +movq 16(%rbx), %rcx +salq $24, %rcx +orq %rcx, %rax +movq 104(%rbx), %rcx +movq 112(%rbx), %rsi +addq %rcx, %rdx +adcq %rsi, %rax +vpxor %xmm0, %xmm0, %xmm0 +vmovdqu %xmm0, (%rbx) +vmovdqu %xmm0, 16(%rbx) +vmovdqu %xmm0, 32(%rbx) +vmovdqu %xmm0, 48(%rbx) +vmovdqu %xmm0, 64(%rbx) +vmovdqu %xmm0, 80(%rbx) +vmovdqu %xmm0, 96(%rbx) +vmovdqu %xmm0, 112(%rbx) +movq %rdx, (%r12) +movq %rax, 8(%r12) +jmp .L43 +.L36: +movq 120(%rbx), %rdx +movl $4, %eax +jmp .L37 +.L43: +addq $32, %rsp +popq %rbx +popq %rbp +popq %r12 +ret +FN_END poly1305_finish_ext_avx + +GLOBAL_HIDDEN_FN poly1305_auth_avx +cmp $128, %rdx +jb poly1305_auth_x86_local +pushq %rbp +movq %rsp, %rbp +pushq %r14 +pushq %r13 +pushq %r12 +pushq %rbx +andq $-64, %rsp +addq $-128, %rsp +movq %rdi, %r14 +movq %rsi, %r12 +movq %rdx, %rbx +movq %rsp, %rdi +movq %rcx, %rsi +call poly1305_init_ext_avx_local +movq %rbx, %r13 +andq $-32, %r13 +je .L46 +movq %rsp, %rdi +movq %r13, %rdx +movq %r12, %rsi +call poly1305_blocks_avx_local +addq %r13, %r12 +subq %r13, %rbx +.L46: +movq %rsp, %rdi +movq %r14, %rcx +movq %rbx, %rdx +movq %r12, %rsi +call poly1305_finish_ext_avx_local +leaq -32(%rbp), %rsp +popq %rbx +popq %r12 +popq %r13 +popq %r14 +popq %rbp +ret +FN_END poly1305_auth_avx diff --git a/src/libcryptobox/poly1305/avx2.S b/src/libcryptobox/poly1305/avx2.S new file mode 100644 index 000000000..068e24d3d --- /dev/null +++ b/src/libcryptobox/poly1305/avx2.S @@ -0,0 +1,1093 @@ +#include "../chacha20/macro.S" +#include "constants.S" +SECTION_TEXT + +GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx2, 0, 0 +movl $64, %eax +ret +FN_END poly1305_block_size_avx2 + +GLOBAL_HIDDEN_FN poly1305_auth_avx2 +cmp $128, %rdx +jb poly1305_auth_x86_local +pushq %rbp +movq %rsp, %rbp +andq $-64, %rsp +pushq %r12 +pushq %r14 +pushq %r15 +pushq %rbx +subq $224, %rsp +movq %rsi, %r14 +movq %rdi, %rbx +lea (%rsp), %rdi +movq 
%rcx, %rsi +movq %rdx, %r12 +call poly1305_init_ext_avx2_local +poly1305_auth_avx2_2: +movq %r12, %r15 +andq $-64, %r15 +je poly1305_auth_avx2_5 +poly1305_auth_avx2_3: +movq %r14, %rsi +lea (%rsp), %rdi +movq %r15, %rdx +call poly1305_blocks_avx2_local +poly1305_auth_avx2_4: +addq %r15, %r14 +subq %r15, %r12 +poly1305_auth_avx2_5: +movq %r14, %rsi +lea (%rsp), %rdi +movq %r12, %rdx +movq %rbx, %rcx +call poly1305_finish_ext_avx2_local +poly1305_auth_avx2_6: +addq $224, %rsp +popq %rbx +popq %r15 +popq %r14 +popq %r12 +movq %rbp, %rsp +popq %rbp +ret +FN_END poly1305_auth_avx2 + + +GLOBAL_HIDDEN_FN poly1305_finish_ext_avx2 +poly1305_finish_ext_avx2_local: +pushq %rbp +movq %rsp, %rbp +andq $-64, %rsp +pushq %r12 +pushq %r13 +pushq %r14 +subq $104, %rsp +movq %rdx, %r13 +movq %rcx, %r14 +movq %rdi, %r12 +testq %r13, %r13 +je poly1305_finish_ext_avx2_29 +poly1305_finish_ext_avx2_2: +lea (%rsp), %rax +vpxor %ymm0, %ymm0, %ymm0 +subq %rax, %rsi +vmovdqu %ymm0, (%rsp) +vmovdqu %ymm0, 32(%rsp) +testq $32, %r13 +je poly1305_finish_ext_avx2_4 +poly1305_finish_ext_avx2_3: +vmovdqu (%rsp,%rsi), %ymm0 +lea 32(%rsp), %rax +vmovdqu %ymm0, (%rsp) +poly1305_finish_ext_avx2_4: +testq $16, %r13 +je poly1305_finish_ext_avx2_6 +poly1305_finish_ext_avx2_5: +vmovdqu (%rax,%rsi), %xmm0 +vmovdqu %xmm0, (%rax) +addq $16, %rax +poly1305_finish_ext_avx2_6: +testq $8, %r13 +je poly1305_finish_ext_avx2_8 +poly1305_finish_ext_avx2_7: +movq (%rax,%rsi), %rdx +movq %rdx, (%rax) +addq $8, %rax +poly1305_finish_ext_avx2_8: +testq $4, %r13 +je poly1305_finish_ext_avx2_10 +poly1305_finish_ext_avx2_9: +movl (%rax,%rsi), %edx +movl %edx, (%rax) +addq $4, %rax +poly1305_finish_ext_avx2_10: +testq $2, %r13 +je poly1305_finish_ext_avx2_12 +poly1305_finish_ext_avx2_11: +movzwl (%rax,%rsi), %edx +movw %dx, (%rax) +addq $2, %rax +poly1305_finish_ext_avx2_12: +testq $1, %r13 +je poly1305_finish_ext_avx2_14 +poly1305_finish_ext_avx2_13: +movb (%rax,%rsi), %dl +movb %dl, (%rax) +poly1305_finish_ext_avx2_14: 
+testq $15, %r13 +je poly1305_finish_ext_avx2_16 +poly1305_finish_ext_avx2_15: +movb $1, (%rsp,%r13) +poly1305_finish_ext_avx2_16: +movq 176(%r12), %rdx +andq $-8125, %rdx +cmpq $48, %r13 +jb poly1305_finish_ext_avx2_18 +poly1305_finish_ext_avx2_17: +orq $4, %rdx +jmp poly1305_finish_ext_avx2_21 +poly1305_finish_ext_avx2_18: +cmpq $32, %r13 +jb poly1305_finish_ext_avx2_20 +poly1305_finish_ext_avx2_19: +orq $8, %rdx +jmp poly1305_finish_ext_avx2_21 +poly1305_finish_ext_avx2_20: +movq %rdx, %rax +orq $32, %rdx +orq $16, %rax +cmpq $16, %r13 +cmovae %rax, %rdx +poly1305_finish_ext_avx2_21: +testq $1, %rdx +je poly1305_finish_ext_avx2_27 +poly1305_finish_ext_avx2_22: +cmpq $16, %r13 +ja poly1305_finish_ext_avx2_24 +poly1305_finish_ext_avx2_23: +orq $256, %rdx +movq %rdx, 176(%r12) +jmp poly1305_finish_ext_avx2_28 +poly1305_finish_ext_avx2_24: +cmpq $32, %r13 +ja poly1305_finish_ext_avx2_27 +poly1305_finish_ext_avx2_25: +orq $128, %rdx +movq %rdx, 176(%r12) +jmp poly1305_finish_ext_avx2_28 +poly1305_finish_ext_avx2_27: +movq %rdx, 176(%r12) +poly1305_finish_ext_avx2_28: +movq %r12, %rdi +lea (%rsp), %rsi +movl $64, %edx +vzeroupper +call poly1305_blocks_avx2_local +poly1305_finish_ext_avx2_29: +movq 176(%r12), %rdx +testq $1, %rdx +je poly1305_finish_ext_avx2_37 +poly1305_finish_ext_avx2_30: +andq $-8125, %rdx +testq %r13, %r13 +je poly1305_finish_ext_avx2_32 +poly1305_finish_ext_avx2_31: +cmpq $48, %r13 +jbe poly1305_finish_ext_avx2_33 +poly1305_finish_ext_avx2_32: +orq $512, %rdx +jmp poly1305_finish_ext_avx2_36 +poly1305_finish_ext_avx2_33: +cmpq $32, %r13 +jbe poly1305_finish_ext_avx2_35 +poly1305_finish_ext_avx2_34: +orq $1024, %rdx +jmp poly1305_finish_ext_avx2_36 +poly1305_finish_ext_avx2_35: +movq %rdx, %rax +orq $4096, %rdx +orq $2048, %rax +cmpq $16, %r13 +cmova %rax, %rdx +poly1305_finish_ext_avx2_36: +orq $96, %rdx +movq %r12, %rdi +vpxor %ymm0, %ymm0, %ymm0 +lea (%rsp), %rsi +movq %rdx, 176(%r12) +movl $64, %edx +vmovdqu %ymm0, (%rsp) +vmovdqu %ymm0, 
32(%rsp) +vzeroupper +call poly1305_blocks_avx2_local +poly1305_finish_ext_avx2_37: +movq 8(%r12), %r8 +movq %r8, %rsi +movq 16(%r12), %rax +vpxor %ymm0, %ymm0, %ymm0 +shlq $44, %rsi +shrq $20, %r8 +shlq $24, %rax +orq (%r12), %rsi +orq %rax, %r8 +movq 160(%r12), %rdx +movq 168(%r12), %rcx +addq %rdx, %rsi +adcq %rcx, %r8 +vmovdqu %ymm0, (%r12) +vmovdqu %ymm0, 32(%r12) +vmovdqu %ymm0, 64(%r12) +vmovdqu %ymm0, 96(%r12) +vmovdqu %ymm0, 128(%r12) +vmovdqu %ymm0, 160(%r12) +movq %rsi, (%r14) +movq %r8, 8(%r14) +vzeroupper +addq $104, %rsp +popq %r14 +popq %r13 +popq %r12 +movq %rbp, %rsp +popq %rbp +ret +FN_END poly1305_finish_ext_avx2 + +GLOBAL_HIDDEN_FN poly1305_blocks_avx2 +poly1305_blocks_avx2_local: +pushq %rbp +movq %rsp, %rbp +andq $-64, %rsp +subq $384, %rsp +movl $16777216, %eax +movl $67108863, %ecx +movl $5, %r8d +vmovd %eax, %xmm1 +vmovd %ecx, %xmm10 +vmovd %r8d, %xmm0 +movq 176(%rdi), %rax +vpbroadcastq %xmm1, %ymm1 +vpbroadcastq %xmm10, %ymm10 +vpbroadcastq %xmm0, %ymm11 +testq $60, %rax +je poly1305_blocks_avx2_11 +poly1305_blocks_avx2_2: +vpsrldq $8, %ymm1, %ymm15 +testq $4, %rax +je poly1305_blocks_avx2_4 +poly1305_blocks_avx2_3: +vpermq $192, %ymm15, %ymm15 +poly1305_blocks_avx2_4: +testq $8, %rax +je poly1305_blocks_avx2_6 +poly1305_blocks_avx2_5: +vpermq $240, %ymm15, %ymm15 +poly1305_blocks_avx2_6: +testq $16, %rax +je poly1305_blocks_avx2_8 +poly1305_blocks_avx2_7: +vpermq $252, %ymm15, %ymm15 +poly1305_blocks_avx2_8: +testq $32, %rax +je poly1305_blocks_avx2_10 +poly1305_blocks_avx2_9: +vpxor %ymm15, %ymm15, %ymm15 +poly1305_blocks_avx2_10: +vmovdqa %ymm15, %ymm1 +poly1305_blocks_avx2_11: +movq %rax, %rcx +btsq $0, %rcx +jc poly1305_blocks_avx2_13 +poly1305_blocks_avx2_12: +vmovdqu (%rsi), %ymm3 +movq %rcx, %rax +vmovdqu 32(%rsi), %ymm5 +vpunpcklqdq %ymm5, %ymm3, %ymm4 +addq $64, %rsi +vpunpckhqdq %ymm5, %ymm3, %ymm7 +vpermq $216, %ymm4, %ymm6 +addq $-64, %rdx +vpermq $216, %ymm7, %ymm0 +vpsrlq $52, %ymm6, %ymm8 +vpsllq $12, %ymm0, %ymm9 +vpsrlq 
$26, %ymm6, %ymm2 +vpsrlq $40, %ymm0, %ymm0 +vpand %ymm6, %ymm10, %ymm4 +vpor %ymm9, %ymm8, %ymm7 +vpand %ymm2, %ymm10, %ymm3 +vpor %ymm1, %ymm0, %ymm9 +vpsrlq $26, %ymm7, %ymm2 +vpand %ymm7, %ymm10, %ymm5 +vpand %ymm2, %ymm10, %ymm7 +movq %rax, 176(%rdi) +jmp poly1305_blocks_avx2_14 +poly1305_blocks_avx2_13: +vpermq $216, (%rdi), %ymm15 +vpxor %ymm0, %ymm0, %ymm0 +vpermq $216, 32(%rdi), %ymm14 +vpermq $216, 64(%rdi), %ymm13 +vpunpckldq %ymm0, %ymm15, %ymm4 +vpunpckhdq %ymm0, %ymm15, %ymm3 +vpunpckldq %ymm0, %ymm14, %ymm5 +vpunpckhdq %ymm0, %ymm14, %ymm7 +vpunpckldq %ymm0, %ymm13, %ymm9 +poly1305_blocks_avx2_14: +cmpq $64, %rdx +jb poly1305_blocks_avx2_34 +poly1305_blocks_avx2_15: +vmovdqu 140(%rdi), %ymm0 +testq $8064, %rax +je poly1305_blocks_avx2_29 +poly1305_blocks_avx2_16: +vpermq $216, 80(%rdi), %ymm6 +vpermq $216, 100(%rdi), %ymm2 +vpermq $216, 120(%rdi), %ymm8 +vpermq $216, %ymm0, %ymm0 +testq $128, %rax +je poly1305_blocks_avx2_18 +poly1305_blocks_avx2_17: +vmovdqa %ymm0, %ymm15 +vmovdqa %ymm0, %ymm14 +vmovdqa %ymm0, %ymm13 +vmovdqa %ymm8, %ymm12 +jmp poly1305_blocks_avx2_28 +poly1305_blocks_avx2_18: +testq $256, %rax +je poly1305_blocks_avx2_20 +poly1305_blocks_avx2_19: +vmovdqa %ymm0, %ymm15 +vmovdqa %ymm0, %ymm14 +vmovdqa %ymm8, %ymm13 +vmovdqa %ymm2, %ymm12 +jmp poly1305_blocks_avx2_28 +poly1305_blocks_avx2_20: +testq $512, %rax +je poly1305_blocks_avx2_22 +poly1305_blocks_avx2_21: +vmovdqa %ymm0, %ymm15 +vmovdqa %ymm8, %ymm14 +vmovdqa %ymm2, %ymm13 +vmovdqa %ymm6, %ymm12 +jmp poly1305_blocks_avx2_28 +poly1305_blocks_avx2_22: +testq $1024, %rax +je poly1305_blocks_avx2_24 +poly1305_blocks_avx2_23: +vpxor %ymm12, %ymm12, %ymm12 +movl $1, %r8d +vmovdqa %ymm8, %ymm15 +vmovdqa %ymm2, %ymm14 +vmovdqa %ymm6, %ymm13 +vmovd %r8d, %xmm12 +jmp poly1305_blocks_avx2_28 +poly1305_blocks_avx2_24: +testq $2048, %rax +je poly1305_blocks_avx2_26 +poly1305_blocks_avx2_25: +vpxor %ymm12, %ymm12, %ymm12 +movl $1, %r8d +vmovd %r8d, %xmm13 +vmovdqa %ymm2, %ymm15 +vmovdqa 
%ymm6, %ymm14 +vmovdqa %ymm13, %ymm12 +jmp poly1305_blocks_avx2_28 +poly1305_blocks_avx2_26: +testq $4096, %rax +je poly1305_blocks_avx2_28 +poly1305_blocks_avx2_27: +movl $1, %r8d +vmovd %r8d, %xmm14 +vmovdqa %ymm6, %ymm15 +vmovdqa %ymm14, %ymm13 +vmovdqa %ymm14, %ymm12 +poly1305_blocks_avx2_28: +vpunpcklqdq %ymm14, %ymm15, %ymm6 +vpunpcklqdq %ymm12, %ymm13, %ymm8 +vpunpckhqdq %ymm14, %ymm15, %ymm14 +vpunpckhqdq %ymm12, %ymm13, %ymm12 +vperm2i128 $32, %ymm8, %ymm6, %ymm2 +vperm2i128 $49, %ymm8, %ymm6, %ymm6 +vpsrlq $32, %ymm6, %ymm0 +vpsrlq $32, %ymm2, %ymm8 +vmovdqu %ymm0, 352(%rsp) +vperm2i128 $32, %ymm12, %ymm14, %ymm13 +vmovdqu %ymm13, 320(%rsp) +jmp poly1305_blocks_avx2_30 +poly1305_blocks_avx2_29: +vpsrlq $32, %ymm0, %ymm12 +vpermq $0, %ymm0, %ymm2 +vpermq $85, %ymm0, %ymm6 +vpermq $85, %ymm12, %ymm13 +vpermq $170, %ymm0, %ymm0 +vpermq $0, %ymm12, %ymm8 +vmovdqu %ymm13, 352(%rsp) +vmovdqu %ymm0, 320(%rsp) +poly1305_blocks_avx2_30: +vmovdqu (%rsi), %ymm12 +movq %rdx, %r9 +vmovdqu 352(%rsp), %ymm15 +vmovdqu %ymm1, 160(%rsp) +vmovdqu %ymm10, 192(%rsp) +vmovdqu %ymm11, 128(%rsp) +vperm2i128 $32, 32(%rsi), %ymm12, %ymm13 +xorl %r8d, %r8d +vperm2i128 $49, 32(%rsi), %ymm12, %ymm12 +xorl %ecx, %ecx +vpmuludq %ymm11, %ymm8, %ymm0 +vpmuludq %ymm11, %ymm6, %ymm1 +vmovdqu %ymm0, 224(%rsp) +vmovdqu %ymm1, 256(%rsp) +vpunpckldq %ymm12, %ymm13, %ymm14 +vpunpckhdq %ymm12, %ymm13, %ymm12 +vmovdqu %ymm14, 32(%rsp) +vpmuludq %ymm0, %ymm9, %ymm0 +vpmuludq %ymm1, %ymm7, %ymm13 +vpaddq %ymm13, %ymm0, %ymm0 +vpmuludq %ymm11, %ymm15, %ymm10 +vpmuludq %ymm10, %ymm5, %ymm13 +vpaddq %ymm13, %ymm0, %ymm0 +vmovdqu %ymm10, 288(%rsp) +vpmuludq 320(%rsp), %ymm11, %ymm11 +vpmuludq %ymm11, %ymm3, %ymm13 +vpaddq %ymm13, %ymm0, %ymm0 +vmovdqu %ymm11, (%rsp) +vpmuludq %ymm2, %ymm4, %ymm13 +vpaddq %ymm13, %ymm0, %ymm0 +vpxor %ymm13, %ymm13, %ymm13 +vpunpckldq %ymm13, %ymm14, %ymm14 +vpaddq %ymm14, %ymm0, %ymm0 +vmovdqu %ymm0, 64(%rsp) +vpmuludq %ymm11, %ymm9, %ymm14 +vpmuludq %ymm2, %ymm7, %ymm0 
+vpaddq %ymm0, %ymm14, %ymm14 +vpmuludq %ymm8, %ymm5, %ymm0 +vpaddq %ymm0, %ymm14, %ymm14 +vpmuludq %ymm6, %ymm3, %ymm0 +vpaddq %ymm0, %ymm14, %ymm14 +vpmuludq %ymm15, %ymm4, %ymm0 +vpaddq %ymm0, %ymm14, %ymm0 +vpunpckhdq %ymm13, %ymm12, %ymm14 +vpsllq $18, %ymm14, %ymm14 +vpaddq %ymm14, %ymm0, %ymm14 +vpmuludq %ymm1, %ymm9, %ymm1 +vpmuludq %ymm10, %ymm7, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vpmuludq %ymm11, %ymm5, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vpmuludq %ymm2, %ymm3, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vpmuludq %ymm8, %ymm4, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vmovdqu 32(%rsp), %ymm0 +vpunpckhdq %ymm13, %ymm0, %ymm0 +vpsllq $6, %ymm0, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vmovdqu 64(%rsp), %ymm0 +vpsrlq $26, %ymm0, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vmovdqu %ymm1, 96(%rsp) +vpmuludq %ymm2, %ymm9, %ymm1 +vpmuludq %ymm8, %ymm7, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vpmuludq %ymm10, %ymm9, %ymm10 +vpmuludq %ymm11, %ymm7, %ymm11 +vpaddq %ymm11, %ymm10, %ymm7 +vpmuludq %ymm6, %ymm5, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vpmuludq %ymm2, %ymm5, %ymm5 +vpaddq %ymm5, %ymm7, %ymm10 +vpmuludq %ymm15, %ymm3, %ymm15 +vpaddq %ymm15, %ymm1, %ymm1 +vpmuludq %ymm8, %ymm3, %ymm11 +vpaddq %ymm11, %ymm10, %ymm5 +vpunpckldq %ymm13, %ymm12, %ymm10 +vmovdqu 96(%rsp), %ymm12 +vpmuludq 320(%rsp), %ymm4, %ymm0 +vpaddq %ymm0, %ymm1, %ymm15 +vpsrlq $26, %ymm12, %ymm3 +vmovdqu 160(%rsp), %ymm1 +vpmuludq %ymm6, %ymm4, %ymm4 +vpaddq %ymm1, %ymm15, %ymm0 +vpsrlq $26, %ymm14, %ymm15 +vpaddq %ymm4, %ymm5, %ymm11 +vpsllq $12, %ymm10, %ymm4 +vmovdqu 192(%rsp), %ymm10 +vpaddq %ymm15, %ymm0, %ymm0 +vpaddq %ymm4, %ymm11, %ymm5 +vmovdqu 128(%rsp), %ymm11 +vpsrlq $26, %ymm0, %ymm9 +vpaddq %ymm3, %ymm5, %ymm7 +vpand 64(%rsp), %ymm10, %ymm13 +vpand %ymm10, %ymm12, %ymm12 +vpand %ymm10, %ymm7, %ymm5 +vpsrlq $26, %ymm7, %ymm7 +vpmuludq %ymm11, %ymm9, %ymm15 +vpand %ymm10, %ymm14, %ymm9 +vpaddq %ymm15, %ymm13, %ymm3 +vpand %ymm10, %ymm0, %ymm14 +vpaddq %ymm7, %ymm9, %ymm9 +vpand %ymm10, %ymm3, %ymm4 +vpsrlq $26, 
%ymm3, %ymm3 +vpsrlq $26, %ymm9, %ymm0 +vpand %ymm10, %ymm9, %ymm7 +vpaddq %ymm3, %ymm12, %ymm3 +vpaddq %ymm0, %ymm14, %ymm9 +sarq $5, %r9 +shrq $58, %r9 +addq %rdx, %r9 +sarq $6, %r9 +cmpq $2, %r9 +jl poly1305_blocks_avx2_34 +poly1305_blocks_avx2_31: +vmovdqu %ymm6, 32(%rsp) +lea -64(%rdx), %r9 +vmovdqu %ymm8, 64(%rsp) +vmovdqu %ymm11, 128(%rsp) +vmovdqu %ymm10, 192(%rsp) +vmovdqu %ymm1, 160(%rsp) +vmovdqu (%rsp), %ymm12 +sarq $5, %r9 +shrq $58, %r9 +lea -64(%rdx,%r9), %rdx +sarq $6, %rdx +poly1305_blocks_avx2_32: +vmovdqu 256(%rsp), %ymm15 +incq %r8 +vmovdqu 64(%rcx,%rsi), %ymm11 +vpmuludq 224(%rsp), %ymm9, %ymm8 +vpmuludq %ymm15, %ymm7, %ymm14 +vpaddq %ymm14, %ymm8, %ymm1 +vmovdqu 288(%rsp), %ymm8 +vperm2i128 $32, 96(%rcx,%rsi), %ymm11, %ymm10 +vperm2i128 $49, 96(%rcx,%rsi), %ymm11, %ymm6 +addq $64, %rcx +vpmuludq %ymm8, %ymm5, %ymm13 +vpunpckldq %ymm6, %ymm10, %ymm0 +vpunpckhdq %ymm6, %ymm10, %ymm11 +vpaddq %ymm13, %ymm1, %ymm10 +vpmuludq %ymm12, %ymm3, %ymm6 +vpaddq %ymm6, %ymm10, %ymm14 +vpxor %ymm10, %ymm10, %ymm10 +vpunpckldq %ymm10, %ymm0, %ymm6 +vpunpckhdq %ymm10, %ymm0, %ymm0 +vpmuludq %ymm2, %ymm4, %ymm1 +vpaddq %ymm1, %ymm14, %ymm13 +vpaddq %ymm6, %ymm13, %ymm1 +vmovdqu 64(%rsp), %ymm6 +vmovdqu %ymm1, (%rsp) +vpsrlq $26, %ymm1, %ymm1 +vpmuludq %ymm12, %ymm9, %ymm14 +vpmuludq %ymm2, %ymm7, %ymm13 +vpaddq %ymm13, %ymm14, %ymm14 +vpmuludq %ymm6, %ymm5, %ymm13 +vpaddq %ymm13, %ymm14, %ymm14 +vpmuludq 32(%rsp), %ymm3, %ymm13 +vpaddq %ymm13, %ymm14, %ymm14 +vpmuludq 352(%rsp), %ymm4, %ymm13 +vpaddq %ymm13, %ymm14, %ymm13 +vpunpckhdq %ymm10, %ymm11, %ymm14 +vpsllq $18, %ymm14, %ymm14 +vpaddq %ymm14, %ymm13, %ymm13 +vpmuludq %ymm15, %ymm9, %ymm15 +vpmuludq %ymm8, %ymm7, %ymm14 +vpaddq %ymm14, %ymm15, %ymm15 +vpmuludq %ymm12, %ymm5, %ymm14 +vpaddq %ymm14, %ymm15, %ymm15 +vpmuludq %ymm2, %ymm3, %ymm14 +vpaddq %ymm14, %ymm15, %ymm15 +vpmuludq %ymm6, %ymm4, %ymm14 +vpaddq %ymm14, %ymm15, %ymm14 +vpsllq $6, %ymm0, %ymm15 +vpaddq %ymm15, %ymm14, %ymm14 +vmovdqu 
32(%rsp), %ymm15 +vpaddq %ymm1, %ymm14, %ymm1 +vpmuludq %ymm2, %ymm9, %ymm0 +vpmuludq %ymm6, %ymm7, %ymm14 +vpmuludq %ymm8, %ymm9, %ymm9 +vpmuludq %ymm12, %ymm7, %ymm7 +vpaddq %ymm7, %ymm9, %ymm7 +vpaddq %ymm14, %ymm0, %ymm0 +vpsrlq $26, %ymm1, %ymm9 +vpmuludq %ymm15, %ymm5, %ymm14 +vpmuludq %ymm2, %ymm5, %ymm5 +vpaddq %ymm5, %ymm7, %ymm5 +vpaddq %ymm14, %ymm0, %ymm0 +vpmuludq 352(%rsp), %ymm3, %ymm14 +vpmuludq %ymm6, %ymm3, %ymm3 +vpaddq %ymm3, %ymm5, %ymm5 +vpaddq %ymm14, %ymm0, %ymm0 +vpmuludq 320(%rsp), %ymm4, %ymm14 +vpmuludq %ymm15, %ymm4, %ymm4 +vpaddq %ymm4, %ymm5, %ymm5 +vpaddq %ymm14, %ymm0, %ymm0 +vpunpckldq %ymm10, %ymm11, %ymm4 +vpaddq 160(%rsp), %ymm0, %ymm14 +vpsrlq $26, %ymm13, %ymm0 +vpsllq $12, %ymm4, %ymm3 +vpaddq %ymm0, %ymm14, %ymm14 +vpaddq %ymm3, %ymm5, %ymm7 +vpsrlq $26, %ymm14, %ymm0 +vpaddq %ymm9, %ymm7, %ymm10 +vmovdqu 192(%rsp), %ymm9 +vpsrlq $26, %ymm10, %ymm11 +vpand (%rsp), %ymm9, %ymm6 +vpand %ymm9, %ymm13, %ymm13 +vpand %ymm9, %ymm1, %ymm1 +vpand %ymm9, %ymm14, %ymm14 +vpand %ymm9, %ymm10, %ymm5 +vpmuludq 128(%rsp), %ymm0, %ymm8 +vpaddq %ymm8, %ymm6, %ymm15 +vpaddq %ymm11, %ymm13, %ymm0 +vpsrlq $26, %ymm15, %ymm3 +vpand %ymm9, %ymm0, %ymm7 +vpsrlq $26, %ymm0, %ymm0 +vpand %ymm9, %ymm15, %ymm4 +vpaddq %ymm3, %ymm1, %ymm3 +vpaddq %ymm0, %ymm14, %ymm9 +cmpq %rdx, %r8 +jb poly1305_blocks_avx2_32 +poly1305_blocks_avx2_34: +testq $64, %rax +jne poly1305_blocks_avx2_36 +poly1305_blocks_avx2_35: +vpshufd $8, %ymm4, %ymm0 +vpshufd $8, %ymm3, %ymm3 +vpshufd $8, %ymm5, %ymm5 +vpshufd $8, %ymm7, %ymm7 +vpshufd $8, %ymm9, %ymm9 +vpermq $8, %ymm0, %ymm1 +vpermq $8, %ymm3, %ymm2 +vpermq $8, %ymm5, %ymm4 +vpermq $8, %ymm7, %ymm6 +vpermq $8, %ymm9, %ymm11 +vperm2i128 $32, %ymm2, %ymm1, %ymm8 +vperm2i128 $32, %ymm6, %ymm4, %ymm10 +vmovdqu %ymm8, (%rdi) +vmovdqu %ymm10, 32(%rdi) +vmovdqu %xmm11, 64(%rdi) +jmp poly1305_blocks_avx2_37 +poly1305_blocks_avx2_36: +vpermq $245, %ymm4, %ymm0 +vpaddq %ymm0, %ymm4, %ymm4 +vpermq $245, %ymm3, %ymm1 +vpaddq 
%ymm1, %ymm3, %ymm10 +vpermq $245, %ymm5, %ymm3 +vpermq $170, %ymm4, %ymm6 +vpaddq %ymm3, %ymm5, %ymm13 +vpaddq %ymm6, %ymm4, %ymm8 +vpermq $170, %ymm10, %ymm11 +vpermq $245, %ymm7, %ymm5 +vpaddq %ymm11, %ymm10, %ymm12 +vpaddq %ymm5, %ymm7, %ymm7 +vpermq $170, %ymm13, %ymm14 +vpermq $245, %ymm9, %ymm2 +vpaddq %ymm14, %ymm13, %ymm15 +vpaddq %ymm2, %ymm9, %ymm9 +vpermq $170, %ymm7, %ymm0 +vpaddq %ymm0, %ymm7, %ymm1 +vpermq $170, %ymm9, %ymm2 +vpaddq %ymm2, %ymm9, %ymm3 +vmovd %xmm8, %r9d +movl %r9d, %r8d +shrl $26, %r8d +andq $67108863, %r9 +vmovd %xmm12, %esi +addl %r8d, %esi +movl %esi, %r11d +shrl $26, %esi +andq $67108863, %r11 +vmovd %xmm15, %ecx +addl %esi, %ecx +movl %ecx, %eax +shrl $26, %eax +andq $67108863, %rcx +shlq $8, %rcx +vmovd %xmm1, %r8d +addl %eax, %r8d +movl %r8d, %r10d +shrl $26, %r8d +andq $67108863, %r10 +movq %r10, %rax +shrq $10, %rax +shlq $34, %r10 +vmovd %xmm3, %edx +addl %r8d, %edx +shlq $16, %rdx +orq %rdx, %rax +movq %rax, %r8 +shrq $42, %r8 +lea (%r8,%r8,4), %rdx +movq %r11, %r8 +shlq $26, %r8 +orq %r8, %r9 +movq $0xfffffffffff, %r8 +shrq $18, %r11 +andq %r8, %r9 +addq %r9, %rdx +orq %rcx, %r11 +movq %rdx, %rsi +orq %r10, %r11 +shrq $44, %rsi +andq %r8, %r11 +addq %r11, %rsi +movq $0x3ffffffffff, %r9 +movq %rsi, %r10 +andq %r9, %rax +shrq $44, %r10 +andq %r8, %rdx +addq %r10, %rax +movq %r8, %rcx +andq %rax, %r9 +andq %r8, %rsi +shrq $42, %rax +movq $0xfffffc0000000000, %r10 +lea (%rax,%rax,4), %r11 +addq %r11, %rdx +andq %rdx, %rcx +shrq $44, %rdx +addq %rdx, %rsi +lea 5(%rcx), %rdx +movq %rdx, %r11 +andq %r8, %rdx +shrq $44, %r11 +addq %rsi, %r11 +movq %r11, %rax +andq %r11, %r8 +shrq $44, %rax +addq %r9, %rax +addq %r10, %rax +movq %rax, %r10 +shrq $63, %r10 +decq %r10 +andn %rcx, %r10, %rcx +andq %r10, %rdx +orq %rdx, %rcx +andq %r10, %r8 +andn %rsi, %r10, %rdx +andq %r10, %rax +andn %r9, %r10, %rsi +orq %r8, %rdx +orq %rax, %rsi +movq %rcx, (%rdi) +movq %rdx, 8(%rdi) +movq %rsi, 16(%rdi) +poly1305_blocks_avx2_37: +vzeroupper +movq 
%rbp, %rsp +popq %rbp +ret +FN_END poly1305_blocks_avx2 + +GLOBAL_HIDDEN_FN poly1305_init_ext_avx2 +poly1305_init_ext_avx2_local: +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +pushq %rbx +movq %rdi, %r10 +vpxor %ymm0, %ymm0, %ymm0 +movq %rdx, %r12 +vpxor %xmm1, %xmm1, %xmm1 +vmovdqu %xmm1, 64(%r10) +vmovdqu %ymm0, (%r10) +vmovdqu %ymm0, 32(%r10) +movq $-1, %r8 +testq %r12, %r12 +movq 8(%rsi), %rdi +movq $0xffc0fffffff, %r9 +movq %rdi, %rcx +cmove %r8, %r12 +movq (%rsi), %r8 +andq %r8, %r9 +shrq $44, %r8 +movq $0xfffffc0ffff, %r11 +shlq $20, %rcx +shrq $24, %rdi +orq %rcx, %r8 +movq $0xffffffc0f, %rcx +andq %r11, %r8 +andq %rcx, %rdi +movq 16(%rsi), %rcx +movq %rcx, 160(%r10) +movq %r9, %rcx +movq 24(%rsi), %rdx +movq %rdx, 168(%r10) +movl %r9d, %edx +andl $67108863, %edx +movl %edx, 80(%r10) +movq %r8, %rdx +shrq $26, %rcx +shlq $18, %rdx +orq %rdx, %rcx +movq %r8, %rdx +shrq $8, %rdx +andl $67108863, %ecx +andl $67108863, %edx +movl %ecx, 84(%r10) +movq %r8, %rcx +movl %edx, 88(%r10) +movq %rdi, %rdx +shrq $34, %rcx +shlq $10, %rdx +orq %rdx, %rcx +movq %rdi, %rdx +shrq $16, %rdx +andl $67108863, %ecx +movl %ecx, 92(%r10) +movl %edx, 96(%r10) +cmpq $16, %r12 +jbe poly1305_init_ext_avx2_7 +poly1305_init_ext_avx2_2: +movq %r9, %rax +lea (%rdi,%rdi,4), %r14 +mulq %r9 +shlq $2, %r14 +movq %rax, %r11 +movq %rdx, %r15 +lea (%r8,%r8), %rax +mulq %r14 +addq %rax, %r11 +lea (%r9,%r9), %rax +movq %r11, %rsi +adcq %rdx, %r15 +mulq %r8 +movq %rax, %rbx +movq %r14, %rax +movq %rdx, %rcx +lea (%rdi,%rdi), %r14 +mulq %rdi +addq %rax, %rbx +movq %r8, %rax +adcq %rdx, %rcx +mulq %r8 +shlq $20, %r15 +movq %rax, %r13 +shrq $44, %rsi +movq %r9, %rax +orq %rsi, %r15 +movq %rdx, %rsi +mulq %r14 +addq %r15, %rbx +movq %rbx, %r15 +adcq $0, %rcx +addq %rax, %r13 +adcq %rdx, %rsi +shlq $20, %rcx +shrq $44, %r15 +orq %r15, %rcx +addq %rcx, %r13 +movq $0xfffffffffff, %rcx +movq %r13, %rdx +adcq $0, %rsi +andq %rcx, %r11 +shlq $22, %rsi +andq %rcx, %rbx +shrq $42, %rdx +orq %rdx, %rsi +lea 
(%rsi,%rsi,4), %rsi +addq %rsi, %r11 +movq %rcx, %rsi +andq %r11, %rsi +shrq $44, %r11 +addq %r11, %rbx +movq $0x3ffffffffff, %r11 +andq %rbx, %rcx +andq %r11, %r13 +shrq $44, %rbx +movq %rsi, %r11 +movq %rcx, %rdx +addq %r13, %rbx +shrq $26, %r11 +movq %rbx, %r15 +shlq $18, %rdx +movq %rcx, %r14 +orq %rdx, %r11 +movq %rcx, %rdx +shrq $34, %rdx +movl %esi, %r13d +shlq $10, %r15 +andl $67108863, %r13d +orq %r15, %rdx +andl $67108863, %r11d +shrq $8, %r14 +andl $67108863, %edx +movl %edx, 112(%r10) +movq %rbx, %rdx +shrq $16, %rdx +andl $67108863, %r14d +movl %r13d, 100(%r10) +movl %r11d, 104(%r10) +movl %r14d, 108(%r10) +movl %edx, 116(%r10) +cmpq $48, %r12 +jbe poly1305_init_ext_avx2_4 +poly1305_init_ext_avx2_3: +movq %rsi, %rax +lea (%rbx,%rbx,4), %r15 +mulq %rsi +shlq $2, %r15 +movq %rax, %r13 +movq %rdx, %r12 +lea (%rcx,%rcx), %rax +mulq %r15 +addq %rax, %r13 +lea (%rsi,%rsi), %rax +movq %r15, -16(%rsp) +adcq %rdx, %r12 +mulq %rcx +movq %rax, %r14 +movq %rbx, %rax +movq %rdx, %r11 +mulq %r15 +addq %rax, %r14 +movq %rcx, %rax +movq %r13, %r15 +adcq %rdx, %r11 +mulq %rcx +shlq $20, %r12 +shrq $44, %r15 +orq %r15, %r12 +movq %rax, %r15 +addq %r12, %r14 +movq %rdx, %r12 +movq %rsi, %rax +lea (%rbx,%rbx), %rdx +adcq $0, %r11 +mulq %rdx +addq %rax, %r15 +adcq %rdx, %r12 +movq %r14, %rdx +shlq $20, %r11 +shrq $44, %rdx +orq %rdx, %r11 +addq %r11, %r15 +movq $0xfffffffffff, %r11 +movq %r15, %rdx +adcq $0, %r12 +andq %r11, %r13 +shlq $22, %r12 +andq %r11, %r14 +shrq $42, %rdx +orq %rdx, %r12 +lea (%r12,%r12,4), %r12 +addq %r12, %r13 +movq %r11, %r12 +andq %r13, %r12 +shrq $44, %r13 +addq %r13, %r14 +movq $0x3ffffffffff, %r13 +andq %r14, %r11 +andq %r13, %r15 +shrq $44, %r14 +movq %r11, %rdx +shlq $18, %rdx +addq %r14, %r15 +movl %r12d, %r14d +movq %r11, %r13 +shrq $26, %r12 +andl $67108863, %r14d +orq %rdx, %r12 +movq %r15, %rdx +shrq $34, %r11 +shlq $10, %rdx +andl $67108863, %r12d +orq %rdx, %r11 +shrq $8, %r13 +andl $67108863, %r11d +movl %r11d, 152(%r10) +andl 
$67108863, %r13d +shrq $16, %r15 +movl %r14d, 140(%r10) +movl %r12d, 144(%r10) +movl %r13d, 148(%r10) +movl %r15d, 156(%r10) +movq -16(%rsp), %r11 +jmp poly1305_init_ext_avx2_6 +poly1305_init_ext_avx2_4: +cmpq $32, %r12 +jbe poly1305_init_ext_avx2_7 +poly1305_init_ext_avx2_5: +lea (%rbx,%rbx,4), %r11 +shlq $2, %r11 +poly1305_init_ext_avx2_6: +movq %r9, %rax +lea (%rcx,%rcx,4), %r13 +mulq %rsi +shlq $2, %r13 +movq %rax, %r14 +movq %rdi, %rax +movq %rdx, %r12 +mulq %r13 +addq %rax, %r14 +movq %r8, %rax +adcq %rdx, %r12 +mulq %r11 +addq %rax, %r14 +movq %r8, %rax +adcq %rdx, %r12 +mulq %rsi +movq %rax, %r15 +movq %r9, %rax +movq %rdx, %r13 +mulq %rcx +addq %rax, %r15 +movq %r11, %rax +movq %r14, %r11 +adcq %rdx, %r13 +mulq %rdi +addq %rax, %r15 +movq %rdi, %rax +adcq %rdx, %r13 +mulq %rsi +shlq $20, %r12 +movq %rax, %rsi +shrq $44, %r11 +movq %r8, %rax +orq %r11, %r12 +movq %rdx, %rdi +mulq %rcx +addq %r12, %r15 +movq %r15, %rcx +adcq $0, %r13 +addq %rax, %rsi +movq %r9, %rax +movq $0xfffffffffff, %r9 +adcq %rdx, %rdi +andq %r9, %r14 +mulq %rbx +addq %rax, %rsi +adcq %rdx, %rdi +movq %r9, %rdx +shlq $20, %r13 +andq %r9, %r15 +shrq $44, %rcx +orq %rcx, %r13 +addq %r13, %rsi +movq %rsi, %rbx +adcq $0, %rdi +shlq $22, %rdi +shrq $42, %rbx +orq %rbx, %rdi +lea (%rdi,%rdi,4), %r8 +addq %r8, %r14 +andq %r14, %rdx +shrq $44, %r14 +addq %r14, %r15 +movq $0x3ffffffffff, %r14 +andq %r15, %r9 +andq %r14, %rsi +shrq $44, %r15 +movq %r9, %rax +addq %r15, %rsi +movl %edx, %r15d +movq %rsi, %rbx +movq %r9, %rcx +shrq $26, %rdx +andl $67108863, %r15d +shlq $18, %rax +shrq $34, %r9 +orq %rax, %rdx +shlq $10, %rbx +shrq $8, %rcx +orq %rbx, %r9 +shrq $16, %rsi +andl $67108863, %edx +andl $67108863, %ecx +andl $67108863, %r9d +movl %r15d, 120(%r10) +movl %edx, 124(%r10) +movl %ecx, 128(%r10) +movl %r9d, 132(%r10) +movl %esi, 136(%r10) +poly1305_init_ext_avx2_7: +movq $0, 176(%r10) +vzeroupper +popq %rbx +popq %r15 +popq %r14 +popq %r13 +popq %r12 +ret +FN_END poly1305_init_ext_avx2 + 
diff --git a/src/libcryptobox/poly1305/constants.S b/src/libcryptobox/poly1305/constants.S new file mode 100644 index 000000000..a4797a2aa --- /dev/null +++ b/src/libcryptobox/poly1305/constants.S @@ -0,0 +1,21 @@ +SECTION_RODATA + +.p2align 4 +poly1305_constants_x86: +/* 0 */ poly1305_x86_scale: .long 0x0,0x37f40000 +/* 8 */ poly1305_x86_two32: .long 0x0,0x41f00000 +/* 16 */ poly1305_x86_two64: .long 0x0,0x43f00000 +/* 24 */ poly1305_x86_two96: .long 0x0,0x45f00000 +/* 32 */ poly1305_x86_alpha32: .long 0x0,0x45e80000 +/* 40 */ poly1305_x86_alpha64: .long 0x0,0x47e80000 +/* 48 */ poly1305_x86_alpha96: .long 0x0,0x49e80000 +/* 56 */ poly1305_x86_alpha130: .long 0x0,0x4c080000 +/* 64 */ poly1305_x86_doffset0: .long 0x0,0x43300000 +/* 72 */ poly1305_x86_doffset1: .long 0x0,0x45300000 +/* 80 */ poly1305_x86_doffset2: .long 0x0,0x47300000 +/* 88 */ poly1305_x86_doffset3: .long 0x0,0x49300000 +/* 96 */ poly1305_x86_doffset3minustwo128: .long 0x0,0x492ffffe +/* 104 */ poly1305_x86_hoffset0: .long 0xfffffffb,0x43300001 +/* 112 */ poly1305_x86_hoffset1: .long 0xfffffffe,0x45300001 +/* 120 */ poly1305_x86_hoffset2: .long 0xfffffffe,0x47300001 +/* 124 */ poly1305_x86_hoffset3: .long 0xfffffffe,0x49300003 diff --git a/src/libcryptobox/poly1305/poly1305-donna-16.h b/src/libcryptobox/poly1305/poly1305-donna-16.h deleted file mode 100644 index 5e5c6d3b6..000000000 --- a/src/libcryptobox/poly1305/poly1305-donna-16.h +++ /dev/null @@ -1,202 +0,0 @@ -/* - poly1305 implementation using 16 bit * 16 bit = 32 bit multiplication and 32 bit addition -*/ - -#if defined(_MSC_VER) - #define POLY1305_NOINLINE __declspec(noinline) -#elif defined(__GNUC__) - #define POLY1305_NOINLINE __attribute__((noinline)) -#else - #define POLY1305_NOINLINE -#endif - -#define poly1305_block_size 16 - -/* 17 + sizeof(size_t) + 18*sizeof(unsigned short) */ -typedef struct poly1305_state_internal_t { - unsigned char buffer[poly1305_block_size]; - size_t leftover; - unsigned short r[10]; - unsigned short h[10]; 
- unsigned short pad[8]; - unsigned char final; -} poly1305_state_internal_t; - -/* interpret two 8 bit unsigned integers as a 16 bit unsigned integer in little endian */ -static unsigned short -U8TO16(const unsigned char *p) { - return - (((unsigned short)(p[0] & 0xff) ) | - ((unsigned short)(p[1] & 0xff) << 8)); -} - -/* store a 16 bit unsigned integer as two 8 bit unsigned integers in little endian */ -static void -U16TO8(unsigned char *p, unsigned short v) { - p[0] = (v ) & 0xff; - p[1] = (v >> 8) & 0xff; -} - -void -poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { - poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; - unsigned short t0,t1,t2,t3,t4,t5,t6,t7; - size_t i; - - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - t0 = U8TO16(&key[ 0]); st->r[0] = ( t0 ) & 0x1fff; - t1 = U8TO16(&key[ 2]); st->r[1] = ((t0 >> 13) | (t1 << 3)) & 0x1fff; - t2 = U8TO16(&key[ 4]); st->r[2] = ((t1 >> 10) | (t2 << 6)) & 0x1f03; - t3 = U8TO16(&key[ 6]); st->r[3] = ((t2 >> 7) | (t3 << 9)) & 0x1fff; - t4 = U8TO16(&key[ 8]); st->r[4] = ((t3 >> 4) | (t4 << 12)) & 0x00ff; - st->r[5] = ((t4 >> 1) ) & 0x1ffe; - t5 = U8TO16(&key[10]); st->r[6] = ((t4 >> 14) | (t5 << 2)) & 0x1fff; - t6 = U8TO16(&key[12]); st->r[7] = ((t5 >> 11) | (t6 << 5)) & 0x1f81; - t7 = U8TO16(&key[14]); st->r[8] = ((t6 >> 8) | (t7 << 8)) & 0x1fff; - st->r[9] = ((t7 >> 5) ) & 0x007f; - - /* h = 0 */ - for (i = 0; i < 10; i++) - st->h[i] = 0; - - /* save pad for later */ - for (i = 0; i < 8; i++) - st->pad[i] = U8TO16(&key[16 + (2 * i)]); - - st->leftover = 0; - st->final = 0; -} - -static void -poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { - const unsigned short hibit = (st->final) ? 
0 : (1 << 11); /* 1 << 128 */ - unsigned short t0,t1,t2,t3,t4,t5,t6,t7; - unsigned long d[10]; - unsigned long c; - - while (bytes >= poly1305_block_size) { - size_t i, j; - - /* h += m[i] */ - t0 = U8TO16(&m[ 0]); st->h[0] += ( t0 ) & 0x1fff; - t1 = U8TO16(&m[ 2]); st->h[1] += ((t0 >> 13) | (t1 << 3)) & 0x1fff; - t2 = U8TO16(&m[ 4]); st->h[2] += ((t1 >> 10) | (t2 << 6)) & 0x1fff; - t3 = U8TO16(&m[ 6]); st->h[3] += ((t2 >> 7) | (t3 << 9)) & 0x1fff; - t4 = U8TO16(&m[ 8]); st->h[4] += ((t3 >> 4) | (t4 << 12)) & 0x1fff; - st->h[5] += ((t4 >> 1) ) & 0x1fff; - t5 = U8TO16(&m[10]); st->h[6] += ((t4 >> 14) | (t5 << 2)) & 0x1fff; - t6 = U8TO16(&m[12]); st->h[7] += ((t5 >> 11) | (t6 << 5)) & 0x1fff; - t7 = U8TO16(&m[14]); st->h[8] += ((t6 >> 8) | (t7 << 8)) & 0x1fff; - st->h[9] += ((t7 >> 5) ) | hibit; - - /* h *= r, (partial) h %= p */ - for (i = 0, c = 0; i < 10; i++) { - d[i] = c; - for (j = 0; j < 10; j++) { - d[i] += (unsigned long)st->h[j] * ((j <= i) ? st->r[i - j] : (5 * st->r[i + 10 - j])); - /* Sum(h[i] * r[i] * 5) will overflow slightly above 6 products with an unclamped r, so carry at 5 */ - if (j == 4) { - c = (d[i] >> 13); - d[i] &= 0x1fff; - } - } - c += (d[i] >> 13); - d[i] &= 0x1fff; - } - c = ((c << 2) + c); /* c *= 5 */ - c += d[0]; - d[0] = ((unsigned short)c & 0x1fff); - c = (c >> 13); - d[1] += c; - - for (i = 0; i < 10; i++) - st->h[i] = (unsigned short)d[i]; - - m += poly1305_block_size; - bytes -= poly1305_block_size; - } -} - -POLY1305_NOINLINE void -poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { - poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; - unsigned short c; - unsigned short g[10]; - unsigned short mask; - unsigned long f; - size_t i; - - /* process the remaining block */ - if (st->leftover) { - size_t i = st->leftover; - st->buffer[i++] = 1; - for (; i < poly1305_block_size; i++) - st->buffer[i] = 0; - st->final = 1; - poly1305_blocks(st, st->buffer, poly1305_block_size); - } - - /* fully carry h */ - c = 
st->h[1] >> 13; - st->h[1] &= 0x1fff; - for (i = 2; i < 10; i++) { - st->h[i] += c; - c = st->h[i] >> 13; - st->h[i] &= 0x1fff; - } - st->h[0] += (c * 5); - c = st->h[0] >> 13; - st->h[0] &= 0x1fff; - st->h[1] += c; - c = st->h[1] >> 13; - st->h[1] &= 0x1fff; - st->h[2] += c; - - /* compute h + -p */ - g[0] = st->h[0] + 5; - c = g[0] >> 13; - g[0] &= 0x1fff; - for (i = 1; i < 10; i++) { - g[i] = st->h[i] + c; - c = g[i] >> 13; - g[i] &= 0x1fff; - } - g[9] -= (1 << 13); - - /* select h if h < p, or h + -p if h >= p */ - mask = (g[9] >> ((sizeof(unsigned short) * 8) - 1)) - 1; - for (i = 0; i < 10; i++) - g[i] &= mask; - mask = ~mask; - for (i = 0; i < 10; i++) - st->h[i] = (st->h[i] & mask) | g[i]; - - /* h = h % (2^128) */ - st->h[0] = ((st->h[0] ) | (st->h[1] << 13) ) & 0xffff; - st->h[1] = ((st->h[1] >> 3) | (st->h[2] << 10) ) & 0xffff; - st->h[2] = ((st->h[2] >> 6) | (st->h[3] << 7) ) & 0xffff; - st->h[3] = ((st->h[3] >> 9) | (st->h[4] << 4) ) & 0xffff; - st->h[4] = ((st->h[4] >> 12) | (st->h[5] << 1) | (st->h[6] << 14)) & 0xffff; - st->h[5] = ((st->h[6] >> 2) | (st->h[7] << 11) ) & 0xffff; - st->h[6] = ((st->h[7] >> 5) | (st->h[8] << 8) ) & 0xffff; - st->h[7] = ((st->h[8] >> 8) | (st->h[9] << 5) ) & 0xffff; - - /* mac = (h + pad) % (2^128) */ - f = (unsigned long)st->h[0] + st->pad[0]; - st->h[0] = (unsigned short)f; - for (i = 1; i < 8; i++) { - f = (unsigned long)st->h[i] + st->pad[i] + (f >> 16); - st->h[i] = (unsigned short)f; - } - - for (i = 0; i < 8; i++) - U16TO8(mac + (i * 2), st->h[i]); - - /* zero out the state */ - for (i = 0; i < 10; i++) - st->h[i] = 0; - for (i = 0; i < 10; i++) - st->r[i] = 0; - for (i = 0; i < 8; i++) - st->pad[i] = 0; -} diff --git a/src/libcryptobox/poly1305/poly1305-donna-32.h b/src/libcryptobox/poly1305/poly1305-donna-32.h deleted file mode 100644 index c45aab33a..000000000 --- a/src/libcryptobox/poly1305/poly1305-donna-32.h +++ /dev/null @@ -1,219 +0,0 @@ -/* - poly1305 implementation using 32 bit * 32 bit = 64 bit 
multiplication and 64 bit addition -*/ - -#if defined(_MSC_VER) - #define POLY1305_NOINLINE __declspec(noinline) -#elif defined(__GNUC__) - #define POLY1305_NOINLINE __attribute__((noinline)) -#else - #define POLY1305_NOINLINE -#endif - -#define poly1305_block_size 16 - -/* 17 + sizeof(size_t) + 14*sizeof(unsigned long) */ -typedef struct poly1305_state_internal_t { - unsigned long r[5]; - unsigned long h[5]; - unsigned long pad[4]; - size_t leftover; - unsigned char buffer[poly1305_block_size]; - unsigned char final; -} poly1305_state_internal_t; - -/* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */ -static unsigned long -U8TO32(const unsigned char *p) { - return - (((unsigned long)(p[0] & 0xff) ) | - ((unsigned long)(p[1] & 0xff) << 8) | - ((unsigned long)(p[2] & 0xff) << 16) | - ((unsigned long)(p[3] & 0xff) << 24)); -} - -/* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */ -static void -U32TO8(unsigned char *p, unsigned long v) { - p[0] = (v ) & 0xff; - p[1] = (v >> 8) & 0xff; - p[2] = (v >> 16) & 0xff; - p[3] = (v >> 24) & 0xff; -} - -void -poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { - poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; - - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - st->r[0] = (U8TO32(&key[ 0]) ) & 0x3ffffff; - st->r[1] = (U8TO32(&key[ 3]) >> 2) & 0x3ffff03; - st->r[2] = (U8TO32(&key[ 6]) >> 4) & 0x3ffc0ff; - st->r[3] = (U8TO32(&key[ 9]) >> 6) & 0x3f03fff; - st->r[4] = (U8TO32(&key[12]) >> 8) & 0x00fffff; - - /* h = 0 */ - st->h[0] = 0; - st->h[1] = 0; - st->h[2] = 0; - st->h[3] = 0; - st->h[4] = 0; - - /* save pad for later */ - st->pad[0] = U8TO32(&key[16]); - st->pad[1] = U8TO32(&key[20]); - st->pad[2] = U8TO32(&key[24]); - st->pad[3] = U8TO32(&key[28]); - - st->leftover = 0; - st->final = 0; -} - -static void -poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { - const unsigned long hibit = 
(st->final) ? 0 : (1 << 24); /* 1 << 128 */ - unsigned long r0,r1,r2,r3,r4; - unsigned long s1,s2,s3,s4; - unsigned long h0,h1,h2,h3,h4; - unsigned long long d0,d1,d2,d3,d4; - unsigned long c; - - r0 = st->r[0]; - r1 = st->r[1]; - r2 = st->r[2]; - r3 = st->r[3]; - r4 = st->r[4]; - - s1 = r1 * 5; - s2 = r2 * 5; - s3 = r3 * 5; - s4 = r4 * 5; - - h0 = st->h[0]; - h1 = st->h[1]; - h2 = st->h[2]; - h3 = st->h[3]; - h4 = st->h[4]; - - while (bytes >= poly1305_block_size) { - /* h += m[i] */ - h0 += (U8TO32(m+ 0) ) & 0x3ffffff; - h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff; - h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff; - h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff; - h4 += (U8TO32(m+12) >> 8) | hibit; - - /* h *= r */ - d0 = ((unsigned long long)h0 * r0) + ((unsigned long long)h1 * s4) + ((unsigned long long)h2 * s3) + ((unsigned long long)h3 * s2) + ((unsigned long long)h4 * s1); - d1 = ((unsigned long long)h0 * r1) + ((unsigned long long)h1 * r0) + ((unsigned long long)h2 * s4) + ((unsigned long long)h3 * s3) + ((unsigned long long)h4 * s2); - d2 = ((unsigned long long)h0 * r2) + ((unsigned long long)h1 * r1) + ((unsigned long long)h2 * r0) + ((unsigned long long)h3 * s4) + ((unsigned long long)h4 * s3); - d3 = ((unsigned long long)h0 * r3) + ((unsigned long long)h1 * r2) + ((unsigned long long)h2 * r1) + ((unsigned long long)h3 * r0) + ((unsigned long long)h4 * s4); - d4 = ((unsigned long long)h0 * r4) + ((unsigned long long)h1 * r3) + ((unsigned long long)h2 * r2) + ((unsigned long long)h3 * r1) + ((unsigned long long)h4 * r0); - - /* (partial) h %= p */ - c = (unsigned long)(d0 >> 26); h0 = (unsigned long)d0 & 0x3ffffff; - d1 += c; c = (unsigned long)(d1 >> 26); h1 = (unsigned long)d1 & 0x3ffffff; - d2 += c; c = (unsigned long)(d2 >> 26); h2 = (unsigned long)d2 & 0x3ffffff; - d3 += c; c = (unsigned long)(d3 >> 26); h3 = (unsigned long)d3 & 0x3ffffff; - d4 += c; c = (unsigned long)(d4 >> 26); h4 = (unsigned long)d4 & 0x3ffffff; - h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff; - h1 
+= c; - - m += poly1305_block_size; - bytes -= poly1305_block_size; - } - - st->h[0] = h0; - st->h[1] = h1; - st->h[2] = h2; - st->h[3] = h3; - st->h[4] = h4; -} - -POLY1305_NOINLINE void -poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { - poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; - unsigned long h0,h1,h2,h3,h4,c; - unsigned long g0,g1,g2,g3,g4; - unsigned long long f; - unsigned long mask; - - /* process the remaining block */ - if (st->leftover) { - size_t i = st->leftover; - st->buffer[i++] = 1; - for (; i < poly1305_block_size; i++) - st->buffer[i] = 0; - st->final = 1; - poly1305_blocks(st, st->buffer, poly1305_block_size); - } - - /* fully carry h */ - h0 = st->h[0]; - h1 = st->h[1]; - h2 = st->h[2]; - h3 = st->h[3]; - h4 = st->h[4]; - - c = h1 >> 26; h1 = h1 & 0x3ffffff; - h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff; - h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff; - h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff; - h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff; - h1 += c; - - /* compute h + -p */ - g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff; - g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff; - g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff; - g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff; - g4 = h4 + c - (1 << 26); - - /* select h if h < p, or h + -p if h >= p */ - mask = (g4 >> ((sizeof(unsigned long) * 8) - 1)) - 1; - g0 &= mask; - g1 &= mask; - g2 &= mask; - g3 &= mask; - g4 &= mask; - mask = ~mask; - h0 = (h0 & mask) | g0; - h1 = (h1 & mask) | g1; - h2 = (h2 & mask) | g2; - h3 = (h3 & mask) | g3; - h4 = (h4 & mask) | g4; - - /* h = h % (2^128) */ - h0 = ((h0 ) | (h1 << 26)) & 0xffffffff; - h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; - h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; - h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; - - /* mac = (h + pad) % (2^128) */ - f = (unsigned long long)h0 + st->pad[0] ; h0 = (unsigned long)f; - f = (unsigned long long)h1 + st->pad[1] + (f >> 32); h1 = (unsigned long)f; - f = (unsigned long long)h2 + 
st->pad[2] + (f >> 32); h2 = (unsigned long)f; - f = (unsigned long long)h3 + st->pad[3] + (f >> 32); h3 = (unsigned long)f; - - U32TO8(mac + 0, h0); - U32TO8(mac + 4, h1); - U32TO8(mac + 8, h2); - U32TO8(mac + 12, h3); - - /* zero out the state */ - st->h[0] = 0; - st->h[1] = 0; - st->h[2] = 0; - st->h[3] = 0; - st->h[4] = 0; - st->r[0] = 0; - st->r[1] = 0; - st->r[2] = 0; - st->r[3] = 0; - st->r[4] = 0; - st->pad[0] = 0; - st->pad[1] = 0; - st->pad[2] = 0; - st->pad[3] = 0; -} - diff --git a/src/libcryptobox/poly1305/poly1305-donna-64.h b/src/libcryptobox/poly1305/poly1305-donna-64.h deleted file mode 100644 index 016f5b384..000000000 --- a/src/libcryptobox/poly1305/poly1305-donna-64.h +++ /dev/null @@ -1,224 +0,0 @@ -/* - poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition -*/ - -#if defined(_MSC_VER) - #include <intrin.h> - - typedef struct uint128_t { - unsigned long long lo; - unsigned long long hi; - } uint128_t; - - #define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi) - #define ADD(out, in) { unsigned long long t = out.lo; out.lo += in.lo; out.hi += (out.lo < t) + in.hi; } - #define ADDLO(out, in) { unsigned long long t = out.lo; out.lo += in; out.hi += (out.lo < t); } - #define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift))) - #define LO(in) (in.lo) - - #define POLY1305_NOINLINE __declspec(noinline) -#elif defined(__GNUC__) - #if defined(__SIZEOF_INT128__) - typedef unsigned __int128 uint128_t; - #else - typedef unsigned uint128_t __attribute__((mode(TI))); - #endif - - #define MUL(out, x, y) out = ((uint128_t)x * y) - #define ADD(out, in) out += in - #define ADDLO(out, in) out += in - #define SHR(in, shift) (unsigned long long)(in >> (shift)) - #define LO(in) (unsigned long long)(in) - - #define POLY1305_NOINLINE __attribute__((noinline)) -#endif - -#define poly1305_block_size 16 - -/* 17 + sizeof(size_t) + 8*sizeof(unsigned long long) */ -typedef struct poly1305_state_internal_t { - unsigned long 
long r[3]; - unsigned long long h[3]; - unsigned long long pad[2]; - size_t leftover; - unsigned char buffer[poly1305_block_size]; - unsigned char final; -} poly1305_state_internal_t; - -/* interpret eight 8 bit unsigned integers as a 64 bit unsigned integer in little endian */ -static unsigned long long -U8TO64(const unsigned char *p) { - return - (((unsigned long long)(p[0] & 0xff) ) | - ((unsigned long long)(p[1] & 0xff) << 8) | - ((unsigned long long)(p[2] & 0xff) << 16) | - ((unsigned long long)(p[3] & 0xff) << 24) | - ((unsigned long long)(p[4] & 0xff) << 32) | - ((unsigned long long)(p[5] & 0xff) << 40) | - ((unsigned long long)(p[6] & 0xff) << 48) | - ((unsigned long long)(p[7] & 0xff) << 56)); -} - -/* store a 64 bit unsigned integer as eight 8 bit unsigned integers in little endian */ -static void -U64TO8(unsigned char *p, unsigned long long v) { - p[0] = (v ) & 0xff; - p[1] = (v >> 8) & 0xff; - p[2] = (v >> 16) & 0xff; - p[3] = (v >> 24) & 0xff; - p[4] = (v >> 32) & 0xff; - p[5] = (v >> 40) & 0xff; - p[6] = (v >> 48) & 0xff; - p[7] = (v >> 56) & 0xff; -} - -void -poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { - poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; - unsigned long long t0,t1; - - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - t0 = U8TO64(&key[0]); - t1 = U8TO64(&key[8]); - - st->r[0] = ( t0 ) & 0xffc0fffffff; - st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; - st->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f; - - /* h = 0 */ - st->h[0] = 0; - st->h[1] = 0; - st->h[2] = 0; - - /* save pad for later */ - st->pad[0] = U8TO64(&key[16]); - st->pad[1] = U8TO64(&key[24]); - - st->leftover = 0; - st->final = 0; -} - -static void -poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { - const unsigned long long hibit = (st->final) ? 
0 : ((unsigned long long)1 << 40); /* 1 << 128 */ - unsigned long long r0,r1,r2; - unsigned long long s1,s2; - unsigned long long h0,h1,h2; - unsigned long long c; - uint128_t d0,d1,d2,d; - - r0 = st->r[0]; - r1 = st->r[1]; - r2 = st->r[2]; - - h0 = st->h[0]; - h1 = st->h[1]; - h2 = st->h[2]; - - s1 = r1 * (5 << 2); - s2 = r2 * (5 << 2); - - while (bytes >= poly1305_block_size) { - unsigned long long t0,t1; - - /* h += m[i] */ - t0 = U8TO64(&m[0]); - t1 = U8TO64(&m[8]); - - h0 += (( t0 ) & 0xfffffffffff); - h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff); - h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit; - - /* h *= r */ - MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d); - MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d); - MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d); - - /* (partial) h %= p */ - c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff; - ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff; - ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff; - h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff; - h1 += c; - - m += poly1305_block_size; - bytes -= poly1305_block_size; - } - - st->h[0] = h0; - st->h[1] = h1; - st->h[2] = h2; -} - - -POLY1305_NOINLINE void -poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { - poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; - unsigned long long h0,h1,h2,c; - unsigned long long g0,g1,g2; - unsigned long long t0,t1; - - /* process the remaining block */ - if (st->leftover) { - size_t i = st->leftover; - st->buffer[i] = 1; - for (i = i + 1; i < poly1305_block_size; i++) - st->buffer[i] = 0; - st->final = 1; - poly1305_blocks(st, st->buffer, poly1305_block_size); - } - - /* fully carry h */ - h0 = st->h[0]; - h1 = st->h[1]; - h2 = st->h[2]; - - c = (h1 >> 44); h1 &= 0xfffffffffff; - h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; - h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; - h1 += c; c = (h1 >> 44); h1 &= 
0xfffffffffff; - h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; - h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; - h1 += c; - - /* compute h + -p */ - g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff; - g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff; - g2 = h2 + c - ((unsigned long long)1 << 42); - - /* select h if h < p, or h + -p if h >= p */ - c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1; - g0 &= c; - g1 &= c; - g2 &= c; - c = ~c; - h0 = (h0 & c) | g0; - h1 = (h1 & c) | g1; - h2 = (h2 & c) | g2; - - /* h = (h + pad) */ - t0 = st->pad[0]; - t1 = st->pad[1]; - - h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff; - h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff; - h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff; - - /* mac = h % (2^128) */ - h0 = ((h0 ) | (h1 << 44)); - h1 = ((h1 >> 20) | (h2 << 24)); - - U64TO8(&mac[0], h0); - U64TO8(&mac[8], h1); - - /* zero out the state */ - st->h[0] = 0; - st->h[1] = 0; - st->h[2] = 0; - st->r[0] = 0; - st->r[1] = 0; - st->r[2] = 0; - st->pad[0] = 0; - st->pad[1] = 0; -} - diff --git a/src/libcryptobox/poly1305/poly1305-donna-8.h b/src/libcryptobox/poly1305/poly1305-donna-8.h deleted file mode 100644 index ac5d5aeb5..000000000 --- a/src/libcryptobox/poly1305/poly1305-donna-8.h +++ /dev/null @@ -1,186 +0,0 @@ -/* - poly1305 implementation using 8 bit * 8 bit = 16 bit multiplication and 32 bit addition - - based on the public domain reference version in supercop by djb -*/ - -#if defined(_MSC_VER) - #define POLY1305_NOINLINE __declspec(noinline) -#elif defined(__GNUC__) - #define POLY1305_NOINLINE __attribute__((noinline)) -#else - #define POLY1305_NOINLINE -#endif - -#define poly1305_block_size 16 - -/* 17 + sizeof(size_t) + 51*sizeof(unsigned char) */ -typedef struct poly1305_state_internal_t { - unsigned char buffer[poly1305_block_size]; - size_t leftover; - unsigned char h[17]; - unsigned char r[17]; - unsigned char pad[17]; - 
unsigned char final; -} poly1305_state_internal_t; - -void -poly1305_init(poly1305_context *ctx, const unsigned char key[32]) { - poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; - size_t i; - - st->leftover = 0; - - /* h = 0 */ - for (i = 0; i < 17; i++) - st->h[i] = 0; - - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - st->r[ 0] = key[ 0] & 0xff; - st->r[ 1] = key[ 1] & 0xff; - st->r[ 2] = key[ 2] & 0xff; - st->r[ 3] = key[ 3] & 0x0f; - st->r[ 4] = key[ 4] & 0xfc; - st->r[ 5] = key[ 5] & 0xff; - st->r[ 6] = key[ 6] & 0xff; - st->r[ 7] = key[ 7] & 0x0f; - st->r[ 8] = key[ 8] & 0xfc; - st->r[ 9] = key[ 9] & 0xff; - st->r[10] = key[10] & 0xff; - st->r[11] = key[11] & 0x0f; - st->r[12] = key[12] & 0xfc; - st->r[13] = key[13] & 0xff; - st->r[14] = key[14] & 0xff; - st->r[15] = key[15] & 0x0f; - st->r[16] = 0; - - /* save pad for later */ - for (i = 0; i < 16; i++) - st->pad[i] = key[i + 16]; - st->pad[16] = 0; - - st->final = 0; -} - -static void -poly1305_add(unsigned char h[17], const unsigned char c[17]) { - unsigned short u; - unsigned int i; - for (u = 0, i = 0; i < 17; i++) { - u += (unsigned short)h[i] + (unsigned short)c[i]; - h[i] = (unsigned char)u & 0xff; - u >>= 8; - } -} - -static void -poly1305_squeeze(unsigned char h[17], unsigned long hr[17]) { - unsigned long u; - unsigned int i; - u = 0; - for (i = 0; i < 16; i++) { - u += hr[i]; - h[i] = (unsigned char)u & 0xff; - u >>= 8; - } - u += hr[16]; - h[16] = (unsigned char)u & 0x03; - u >>= 2; - u += (u << 2); /* u *= 5; */ - for (i = 0; i < 16; i++) { - u += h[i]; - h[i] = (unsigned char)u & 0xff; - u >>= 8; - } - h[16] += (unsigned char)u; -} - -static void -poly1305_freeze(unsigned char h[17]) { - static const unsigned char minusp[17] = { - 0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0xfc - }; - unsigned char horig[17], negative; - unsigned int i; - - /* compute h + -p */ - for (i = 0; i < 17; i++) - horig[i] = h[i]; - poly1305_add(h, minusp); - 
- /* select h if h < p, or h + -p if h >= p */ - negative = -(h[16] >> 7); - for (i = 0; i < 17; i++) - h[i] ^= negative & (horig[i] ^ h[i]); -} - -static void -poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) { - const unsigned char hibit = st->final ^ 1; /* 1 << 128 */ - - while (bytes >= poly1305_block_size) { - unsigned long hr[17], u; - unsigned char c[17]; - unsigned int i, j; - - /* h += m */ - for (i = 0; i < 16; i++) - c[i] = m[i]; - c[16] = hibit; - poly1305_add(st->h, c); - - /* h *= r */ - for (i = 0; i < 17; i++) { - u = 0; - for (j = 0; j <= i ; j++) { - u += (unsigned short)st->h[j] * st->r[i - j]; - } - for (j = i + 1; j < 17; j++) { - unsigned long v = (unsigned short)st->h[j] * st->r[i + 17 - j]; - v = ((v << 8) + (v << 6)); /* v *= (5 << 6); */ - u += v; - } - hr[i] = u; - } - - /* (partial) h %= p */ - poly1305_squeeze(st->h, hr); - - m += poly1305_block_size; - bytes -= poly1305_block_size; - } -} - -POLY1305_NOINLINE void -poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) { - poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; - size_t i; - - /* process the remaining block */ - if (st->leftover) { - size_t i = st->leftover; - st->buffer[i++] = 1; - for (; i < poly1305_block_size; i++) - st->buffer[i] = 0; - st->final = 1; - poly1305_blocks(st, st->buffer, poly1305_block_size); - } - - /* fully reduce h */ - poly1305_freeze(st->h); - - /* h = (h + pad) % (1 << 128) */ - poly1305_add(st->h, st->pad); - for (i = 0; i < 16; i++) - mac[i] = st->h[i]; - - /* zero out the state */ - for (i = 0; i < 17; i++) - st->h[i] = 0; - for (i = 0; i < 17; i++) - st->r[i] = 0; - for (i = 0; i < 17; i++) - st->pad[i] = 0; -} diff --git a/src/libcryptobox/poly1305/poly1305-donna.c b/src/libcryptobox/poly1305/poly1305-donna.c deleted file mode 100644 index c1e3c74b5..000000000 --- a/src/libcryptobox/poly1305/poly1305-donna.c +++ /dev/null @@ -1,201 +0,0 @@ -#include "poly1305-donna.h" - -#if 
defined(POLY1305_8BIT) -#include "poly1305-donna-8.h" -#elif defined(POLY1305_16BIT) -#include "poly1305-donna-16.h" -#elif defined(POLY1305_32BIT) -#include "poly1305-donna-32.h" -#elif defined(POLY1305_64BIT) -#include "poly1305-donna-64.h" -#else - -/* auto detect between 32bit / 64bit */ -#define HAS_SIZEOF_INT128_64BIT (defined(__SIZEOF_INT128__) && defined(__LP64__)) -#define HAS_MSVC_64BIT (defined(_MSC_VER) && defined(_M_X64)) -#define HAS_GCC_4_4_64BIT (defined(__GNUC__) && defined(__LP64__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)))) - -#if (HAS_SIZEOF_INT128_64BIT || HAS_MSVC_64BIT || HAS_GCC_4_4_64BIT) -#include "poly1305-donna-64.h" -#else -#include "poly1305-donna-32.h" -#endif - -#endif - -void -poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes) { - poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx; - size_t i; - - /* handle leftover */ - if (st->leftover) { - size_t want = (poly1305_block_size - st->leftover); - if (want > bytes) - want = bytes; - for (i = 0; i < want; i++) - st->buffer[st->leftover + i] = m[i]; - bytes -= want; - m += want; - st->leftover += want; - if (st->leftover < poly1305_block_size) - return; - poly1305_blocks(st, st->buffer, poly1305_block_size); - st->leftover = 0; - } - - /* process full blocks */ - if (bytes >= poly1305_block_size) { - size_t want = (bytes & ~(poly1305_block_size - 1)); - poly1305_blocks(st, m, want); - m += want; - bytes -= want; - } - - /* store leftover */ - if (bytes) { - for (i = 0; i < bytes; i++) - st->buffer[st->leftover + i] = m[i]; - st->leftover += bytes; - } -} - -void -poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]) { - poly1305_context ctx; - poly1305_init(&ctx, key); - poly1305_update(&ctx, m, bytes); - poly1305_finish(&ctx, mac); -} - -int -poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]) { - size_t i; - unsigned int dif = 0; - for (i = 0; i < 
16; i++) - dif |= (mac1[i] ^ mac2[i]); - dif = (dif - 1) >> ((sizeof(unsigned int) * 8) - 1); - return (dif & 1); -} - - -/* test a few basic operations */ -int -poly1305_power_on_self_test(void) { - /* example from nacl */ - static const unsigned char nacl_key[32] = { - 0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91, - 0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25, - 0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65, - 0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80, - }; - - static const unsigned char nacl_msg[131] = { - 0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73, - 0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce, - 0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4, - 0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a, - 0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b, - 0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72, - 0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2, - 0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38, - 0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a, - 0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae, - 0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea, - 0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda, - 0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde, - 0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3, - 0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6, - 0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74, - 0xe3,0x55,0xa5 - }; - - static const unsigned char nacl_mac[16] = { - 0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5, - 0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9 - }; - - /* generates a final value of (2^130 - 2) == 3 */ - static const unsigned char wrap_key[32] = { - 0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - }; - - static const unsigned char wrap_msg[16] = { - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff - }; - - static const unsigned char wrap_mac[16] = { - 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - }; - - /* - mac of the macs of messages of length 0 to 256, where the key and messages - have all their 
values set to the length - */ - static const unsigned char total_key[32] = { - 0x01,0x02,0x03,0x04,0x05,0x06,0x07, - 0xff,0xfe,0xfd,0xfc,0xfb,0xfa,0xf9, - 0xff,0xff,0xff,0xff,0xff,0xff,0xff, - 0xff,0xff,0xff,0xff,0xff,0xff,0xff - }; - - static const unsigned char total_mac[16] = { - 0x64,0xaf,0xe2,0xe8,0xd6,0xad,0x7b,0xbd, - 0xd2,0x87,0xf9,0x7c,0x44,0x62,0x3d,0x39 - }; - - poly1305_context ctx; - poly1305_context total_ctx; - unsigned char all_key[32]; - unsigned char all_msg[256]; - unsigned char mac[16]; - size_t i, j; - int result = 1; - - for (i = 0; i < sizeof(mac); i++) - mac[i] = 0; - poly1305_auth(mac, nacl_msg, sizeof(nacl_msg), nacl_key); - result &= poly1305_verify(nacl_mac, mac); - - for (i = 0; i < sizeof(mac); i++) - mac[i] = 0; - poly1305_init(&ctx, nacl_key); - poly1305_update(&ctx, nacl_msg + 0, 32); - poly1305_update(&ctx, nacl_msg + 32, 64); - poly1305_update(&ctx, nacl_msg + 96, 16); - poly1305_update(&ctx, nacl_msg + 112, 8); - poly1305_update(&ctx, nacl_msg + 120, 4); - poly1305_update(&ctx, nacl_msg + 124, 2); - poly1305_update(&ctx, nacl_msg + 126, 1); - poly1305_update(&ctx, nacl_msg + 127, 1); - poly1305_update(&ctx, nacl_msg + 128, 1); - poly1305_update(&ctx, nacl_msg + 129, 1); - poly1305_update(&ctx, nacl_msg + 130, 1); - poly1305_finish(&ctx, mac); - result &= poly1305_verify(nacl_mac, mac); - - for (i = 0; i < sizeof(mac); i++) - mac[i] = 0; - poly1305_auth(mac, wrap_msg, sizeof(wrap_msg), wrap_key); - result &= poly1305_verify(wrap_mac, mac); - - poly1305_init(&total_ctx, total_key); - for (i = 0; i < 256; i++) { - /* set key and message to 'i,i,i..' 
*/ - for (j = 0; j < sizeof(all_key); j++) - all_key[j] = i; - for (j = 0; j < i; j++) - all_msg[j] = i; - poly1305_auth(mac, all_msg, i, all_key); - poly1305_update(&total_ctx, mac, 16); - } - poly1305_finish(&total_ctx, mac); - result &= poly1305_verify(total_mac, mac); - - return result; -} diff --git a/src/libcryptobox/poly1305/poly1305-donna.h b/src/libcryptobox/poly1305/poly1305-donna.h deleted file mode 100644 index 94e23533f..000000000 --- a/src/libcryptobox/poly1305/poly1305-donna.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef POLY1305_DONNA_H -#define POLY1305_DONNA_H - -#include <stddef.h> - -typedef struct poly1305_context { - size_t aligner; - unsigned char opaque[136]; -} poly1305_context; - -void poly1305_init(poly1305_context *ctx, const unsigned char key[32]); -void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes); -void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]); -void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]); - -int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]); -int poly1305_power_on_self_test(void); - -#endif /* POLY1305_DONNA_H */ - diff --git a/src/libcryptobox/poly1305/poly1305.c b/src/libcryptobox/poly1305/poly1305.c new file mode 100644 index 000000000..ef3b366bc --- /dev/null +++ b/src/libcryptobox/poly1305/poly1305.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2015, Vsevolod Stakhov + * Copyright (c) 2015, Andrew Moon + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "cryptobox.h" +#include "poly1305.h" +#include "platform_config.h" + +extern unsigned long cpu_config; + +typedef struct poly1305_state_internal_t +{ + unsigned char opaque[192]; /* largest state required (AVX2) */ + size_t leftover, block_size; + unsigned char buffer[64]; /* largest blocksize (AVX2) */ +} poly1305_state_internal; + +typedef struct poly1305_impl_t +{ + unsigned long cpu_flags; + const char *desc; + + size_t (*block_size)(void); + void (*init_ext)(void *state, const poly1305_key *key, size_t bytes_hint); + void (*blocks)(void *state, const unsigned char *in, size_t inlen); + void (*finish_ext)(void *state, const unsigned char *in, size_t remaining, + unsigned char *mac); + void (*auth)(unsigned char *mac, const unsigned char *in, size_t inlen, + const poly1305_key *key); +} poly1305_impl_t; + +#define POLY1305_DECLARE(ext) \ + size_t poly1305_block_size_##ext(void); \ + void poly1305_init_ext_##ext(void *state, const poly1305_key *key, size_t bytes_hint); \ + void 
poly1305_blocks_##ext(void *state, const unsigned char *in, size_t inlen); \ + void poly1305_finish_ext_##ext(void *state, const unsigned char *in, size_t remaining, unsigned char *mac); \ + void poly1305_auth_##ext(unsigned char *mac, const unsigned char *m, size_t inlen, const poly1305_key *key); + +#define POLY1305_IMPL(cpuflags, desc, ext) \ + {(cpuflags), desc, poly1305_block_size_##ext, poly1305_init_ext_##ext, poly1305_blocks_##ext, poly1305_finish_ext_##ext, poly1305_auth_##ext} + +#if defined(HAVE_AVX2) +POLY1305_DECLARE(avx2) +#define POLY1305_AVX2 POLY1305_IMPL(CPUID_AVX2, "avx2", avx2) +#endif +#if defined(HAVE_AVX) +POLY1305_DECLARE(avx) +#define POLY1305_AVX POLY1305_IMPL(CPUID_AVX, "avx", avx) +#endif +#if defined(HAVE_SSE2) +POLY1305_DECLARE(sse2) +#define POLY1305_SSE2 POLY1305_IMPL(CPUID_SSE2, "sse2", sse2) +#endif + +POLY1305_DECLARE(ref) +#define POLY1305_GENERIC POLY1305_IMPL(0, "generic", ref) + +/* list implemenations from most optimized to least, with generic as the last entry */ +static const poly1305_impl_t poly1305_list[] = +{ +POLY1305_GENERIC, + +#if defined(POLY1305_AVX2) + POLY1305_AVX2, +#endif +#if defined(POLY1305_AVX) + POLY1305_AVX, +#endif +#if defined(POLY1305_SSE2) + POLY1305_SSE2, +#endif + }; + +static const poly1305_impl_t *poly1305_opt = &poly1305_list[0]; +; + +/* is the pointer aligned on a word boundary? 
*/ +static int poly1305_is_aligned(const void *p) +{ + return ((size_t) p & (sizeof(size_t) - 1)) == 0; +} + +void poly1305_load(void) +{ + guint i; + + if (cpu_config != 0) { + for (i = 0; i < G_N_ELEMENTS(poly1305_list); i++) { + if (poly1305_list[i].cpu_flags & cpu_config) { + poly1305_opt = &poly1305_list[i]; + break; + } + } + } +} + +/* processes inlen bytes (full blocks only), handling input alignment */ +static void poly1305_consume(poly1305_state_internal *state, + const unsigned char *in, size_t inlen) +{ + int in_aligned; + + /* it's ok to call with 0 bytes */ + if (!inlen) + return; + + /* if everything is aligned, handle directly */ + in_aligned = poly1305_is_aligned (in); + if (in_aligned) { + poly1305_opt->blocks (state->opaque, in, inlen); + return; + } + + /* copy the unaligned data to an aligned buffer and process in chunks */ + while (inlen) { + unsigned char buffer[1024]; + const size_t bytes = (inlen > sizeof(buffer)) ? sizeof(buffer) : inlen; + memcpy (buffer, in, bytes); + poly1305_opt->blocks (state->opaque, buffer, bytes); + in += bytes; + inlen -= bytes; + } +} + +void poly1305_init(poly1305_state *S, const poly1305_key *key) +{ + poly1305_state_internal *state = (poly1305_state_internal *) S; + poly1305_opt->init_ext (state->opaque, key, 0); + state->leftover = 0; + state->block_size = poly1305_opt->block_size (); +} + +void poly1305_init_ext(poly1305_state *S, const poly1305_key *key, + size_t bytes_hint) +{ + poly1305_state_internal *state = (poly1305_state_internal *) S; + poly1305_opt->init_ext (state->opaque, key, bytes_hint); + state->leftover = 0; + state->block_size = poly1305_opt->block_size (); +} + +void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen) +{ + poly1305_state_internal *state = (poly1305_state_internal *) S; + + /* handle leftover */ + if (state->leftover) { + size_t want = (state->block_size - state->leftover); + if (want > inlen) + want = inlen; + memcpy (state->buffer + state->leftover, 
in, want); + inlen -= want; + in += want; + state->leftover += want; + if (state->leftover < state->block_size) + return; + poly1305_opt->blocks (state->opaque, state->buffer, state->block_size); + state->leftover = 0; + } + + /* process full blocks */ + if (inlen >= state->block_size) { + size_t want = (inlen & ~(state->block_size - 1)); + poly1305_consume (state, in, want); + in += want; + inlen -= want; + } + + /* store leftover */ + if (inlen) { + memcpy (state->buffer + state->leftover, in, inlen); + state->leftover += inlen; + } +} + +void poly1305_finish(poly1305_state *S, unsigned char *mac) +{ + poly1305_state_internal *state = (poly1305_state_internal *) S; + poly1305_opt->finish_ext (state->opaque, state->buffer, state->leftover, + mac); +} + +void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, + const poly1305_key *key) +{ + poly1305_opt->auth (mac, in, inlen, key); +} + +int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]) +{ + size_t i; + unsigned int dif = 0; + + for (i = 0; i < 16; i++) { + dif |= (mac1[i] ^ mac2[i]); + } + + dif = (dif - 1) >> ((sizeof(unsigned int) * 8) - 1); + return (dif & 1); +} diff --git a/src/libcryptobox/poly1305/poly1305.h b/src/libcryptobox/poly1305/poly1305.h new file mode 100644 index 000000000..8eae97c88 --- /dev/null +++ b/src/libcryptobox/poly1305/poly1305.h @@ -0,0 +1,38 @@ +#ifndef POLY1305_H +#define POLY1305_H + +#include <stddef.h> + +#if defined(__cplusplus) +extern "C" +{ +#endif + +typedef struct poly1305_state +{ + unsigned char opaque[320]; +} poly1305_state; + +typedef struct poly1305_key +{ + unsigned char b[32]; +} poly1305_key; + +void poly1305_init(poly1305_state *S, const poly1305_key *key); +void poly1305_init_ext(poly1305_state *S, const poly1305_key *key, + size_t bytes_hint); +void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen); +void poly1305_finish(poly1305_state *S, unsigned char *mac); + +void 
poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, + const poly1305_key *key); +int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]); + +void poly1305_load(void); + +#if defined(__cplusplus) +} +#endif + +#endif /* POLY1305_H */ + diff --git a/src/libcryptobox/poly1305/ref-32.c b/src/libcryptobox/poly1305/ref-32.c new file mode 100644 index 000000000..8086e1c46 --- /dev/null +++ b/src/libcryptobox/poly1305/ref-32.c @@ -0,0 +1,237 @@ +/* + poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition + + assumes the existence of uint32_t and uint64_t +*/ + +#include "config.h" + +enum { + POLY1305_BLOCK_SIZE = 16 +}; + +typedef struct poly1305_state_ref_t { + uint32_t r[5]; + uint32_t h[5]; + uint32_t pad[4]; + unsigned char final; +} poly1305_state_ref_t; + +/* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */ +static uint32_t +U8TO32(const unsigned char *p) { + return + (((uint32_t)(p[0] & 0xff) ) | + ((uint32_t)(p[1] & 0xff) << 8) | + ((uint32_t)(p[2] & 0xff) << 16) | + ((uint32_t)(p[3] & 0xff) << 24)); +} + +/* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */ +static void +U32TO8(unsigned char *p, uint32_t v) { + p[0] = (unsigned char)((v ) & 0xff); + p[1] = (unsigned char)((v >> 8) & 0xff); + p[2] = (unsigned char)((v >> 16) & 0xff); + p[3] = (unsigned char)((v >> 24) & 0xff); +} + +static size_t +poly1305_block_size_ref(void) { + return POLY1305_BLOCK_SIZE; +} + +static void +poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) { + poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; + + /* bytes_hint not used */ + (void)bytes_hint; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + st->r[0] = (U8TO32(&key->b[ 0]) ) & 0x3ffffff; + st->r[1] = (U8TO32(&key->b[ 3]) >> 2) & 0x3ffff03; + st->r[2] = (U8TO32(&key->b[ 6]) >> 4) & 0x3ffc0ff; + st->r[3] = (U8TO32(&key->b[ 9]) >> 6) & 
0x3f03fff; + st->r[4] = (U8TO32(&key->b[12]) >> 8) & 0x00fffff; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + + /* save pad for later */ + st->pad[0] = U8TO32(&key->b[16]); + st->pad[1] = U8TO32(&key->b[20]); + st->pad[2] = U8TO32(&key->b[24]); + st->pad[3] = U8TO32(&key->b[28]); + + st->final = 0; +} + +static void +poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) { + poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; + const uint32_t hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */ + uint32_t r0,r1,r2,r3,r4; + uint32_t s1,s2,s3,s4; + uint32_t h0,h1,h2,h3,h4; + uint64_t d0,d1,d2,d3,d4; + uint32_t c; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + r3 = st->r[3]; + r4 = st->r[4]; + + s1 = r1 * 5; + s2 = r2 * 5; + s3 = r3 * 5; + s4 = r4 * 5; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + while (inlen >= POLY1305_BLOCK_SIZE) { + /* h += m[i] */ + h0 += (U8TO32(in+ 0) ) & 0x3ffffff; + h1 += (U8TO32(in+ 3) >> 2) & 0x3ffffff; + h2 += (U8TO32(in+ 6) >> 4) & 0x3ffffff; + h3 += (U8TO32(in+ 9) >> 6) & 0x3ffffff; + h4 += (U8TO32(in+12) >> 8) | hibit; + + /* h *= r */ + d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + ((uint64_t)h4 * s1); + d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + ((uint64_t)h4 * s2); + d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + ((uint64_t)h4 * s3); + d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + ((uint64_t)h4 * s4); + d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + ((uint64_t)h4 * r0); + + /* (partial) h %= p */ + c = (uint32_t)(d0 >> 26); h0 = (uint32_t)d0 & 0x3ffffff; + d1 += c; c = (uint32_t)(d1 >> 26); h1 = (uint32_t)d1 & 0x3ffffff; + d2 += c; c = (uint32_t)(d2 >> 26); h2 = (uint32_t)d2 & 
0x3ffffff; + d3 += c; c = (uint32_t)(d3 >> 26); h3 = (uint32_t)d3 & 0x3ffffff; + d4 += c; c = (uint32_t)(d4 >> 26); h4 = (uint32_t)d4 & 0x3ffffff; + h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff; + h1 += c; + + in += POLY1305_BLOCK_SIZE; + inlen -= POLY1305_BLOCK_SIZE; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; + st->h[3] = h3; + st->h[4] = h4; +} + +static void +poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) { + poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; + uint32_t h0,h1,h2,h3,h4,c; + uint32_t g0,g1,g2,g3,g4; + uint64_t f; + uint32_t mask; + + /* process the remaining block */ + if (remaining) { + unsigned char final[POLY1305_BLOCK_SIZE] = {0}; + size_t i; + for (i = 0; i < remaining; i++) + final[i] = in[i]; + final[remaining] = 1; + st->final = 1; + poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE); + } + + /* fully carry h */ + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + c = h1 >> 26; h1 = h1 & 0x3ffffff; + h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff; + h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff; + h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff; + h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff; + h1 += c; + + /* compute h + -p */ + g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff; + g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff; + g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff; + g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff; + g4 = h4 + c - (1 << 26); + + /* select h if h < p, or h + -p if h >= p */ + mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1; + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + g4 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + h4 = (h4 & mask) | g4; + + /* h = h % (2^128) */ + h0 = ((h0 ) | (h1 << 26)) & 0xffffffff; + h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; + h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; + h3 = ((h3 >> 18) | (h4 << 8)) & 
0xffffffff; + + /* mac = (h + pad) % (2^128) */ + f = (uint64_t)h0 + st->pad[0] ; h0 = (uint32_t)f; + f = (uint64_t)h1 + st->pad[1] + (f >> 32); h1 = (uint32_t)f; + f = (uint64_t)h2 + st->pad[2] + (f >> 32); h2 = (uint32_t)f; + f = (uint64_t)h3 + st->pad[3] + (f >> 32); h3 = (uint32_t)f; + + U32TO8(mac + 0, h0); + U32TO8(mac + 4, h1); + U32TO8(mac + 8, h2); + U32TO8(mac + 12, h3); + + /* zero out the state */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + st->r[0] = 0; + st->r[1] = 0; + st->r[2] = 0; + st->r[3] = 0; + st->r[4] = 0; + st->pad[0] = 0; + st->pad[1] = 0; + st->pad[2] = 0; + st->pad[3] = 0; +} + +static void +poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) { + poly1305_state_ref_t st; + size_t blocks; + poly1305_init_ext_ref(&st, key, inlen); + blocks = (inlen & ~(POLY1305_BLOCK_SIZE - 1)); + if (blocks) { + poly1305_blocks_ref(&st, in, blocks); + in += blocks; + inlen -= blocks; + } + poly1305_finish_ext_ref(&st, in, inlen, mac); +} + diff --git a/src/libcryptobox/poly1305/ref-64.c b/src/libcryptobox/poly1305/ref-64.c new file mode 100644 index 000000000..f6ead5955 --- /dev/null +++ b/src/libcryptobox/poly1305/ref-64.c @@ -0,0 +1,231 @@ +/* + poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition + + assumes the existence of uint64_t and uint128_t +*/ + +#include "config.h" +enum { + POLY1305_BLOCK_SIZE = 16 +}; + +#if defined(_MSC_VER) + #include <intrin.h> + + typedef struct uint128_t { + unsigned long long lo; + unsigned long long hi; + } uint128_t; + + #define POLY1305_NOINLINE __declspec(noinline) +#elif defined(__GNUC__) + #if defined(__SIZEOF_INT128__) + typedef unsigned __int128 uint128_t; + #else + typedef unsigned uint128_t __attribute__((mode(TI))); + #endif + + #define POLY1305_NOINLINE __attribute__((noinline)) +#endif + +typedef struct poly1305_state_ref_t { + uint64_t r[3]; + uint64_t h[3]; + uint64_t pad[2]; + 
unsigned char final; +} poly1305_state_ref_t; + +/* interpret eight 8 bit unsigned integers as a 64 bit unsigned integer in little endian */ +static uint64_t +U8TO64(const unsigned char *p) { + return + ((uint64_t)p[0] ) | + ((uint64_t)p[1] << 8) | + ((uint64_t)p[2] << 16) | + ((uint64_t)p[3] << 24) | + ((uint64_t)p[4] << 32) | + ((uint64_t)p[5] << 40) | + ((uint64_t)p[6] << 48) | + ((uint64_t)p[7] << 56); +} + +/* store a 64 bit unsigned integer as eight 8 bit unsigned integers in little endian */ +static void +U64TO8(unsigned char *p, uint64_t v) { + p[0] = (unsigned char)(v ) & 0xff; + p[1] = (unsigned char)(v >> 8) & 0xff; + p[2] = (unsigned char)(v >> 16) & 0xff; + p[3] = (unsigned char)(v >> 24) & 0xff; + p[4] = (unsigned char)(v >> 32) & 0xff; + p[5] = (unsigned char)(v >> 40) & 0xff; + p[6] = (unsigned char)(v >> 48) & 0xff; + p[7] = (unsigned char)(v >> 56) & 0xff; +} + +static size_t +poly1305_block_size_ref(void) { + return POLY1305_BLOCK_SIZE; +} + +static void +poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) { + poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; + uint64_t t0, t1; + + /* bytes_hint not used */ + (void)bytes_hint; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + t0 = U8TO64(&key->b[0]); + t1 = U8TO64(&key->b[8]); + st->r[0] = ( t0 ) & 0xffc0fffffff; + st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; + st->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + + /* save pad for later */ + st->pad[0] = U8TO64(&key->b[16]); + st->pad[1] = U8TO64(&key->b[24]); + + st->final = 0; +} + +static void +poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) { + poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; + const uint64_t hibit = (st->final) ? 
0 : ((uint64_t)1 << 40); /* 1 << 128 */ + uint64_t r0,r1,r2; + uint64_t s1,s2; + uint64_t h0,h1,h2; + uint64_t c; + uint128_t d0,d1,d2; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + + s1 = r1 * (5 << 2); + s2 = r2 * (5 << 2); + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + while (inlen >= POLY1305_BLOCK_SIZE) { + uint64_t t0, t1; + + /* h += in[i] */ + t0 = U8TO64(in + 0); + t1 = U8TO64(in + 8); + h0 += (( t0 ) & 0xfffffffffff); + h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff); + h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit; + + /* h *= r */ + d0 = ((uint128_t)h0 * r0) + ((uint128_t)h1 * s2) + ((uint128_t)h2 * s1); + d1 = ((uint128_t)h0 * r1) + ((uint128_t)h1 * r0) + ((uint128_t)h2 * s2); + d2 = ((uint128_t)h0 * r2) + ((uint128_t)h1 * r1) + ((uint128_t)h2 * r0); + + /* (partial) h %= p */ + c = (uint64_t)(d0 >> 44); h0 = (uint64_t)d0 & 0xfffffffffff; + d1 += c; c = (uint64_t)(d1 >> 44); h1 = (uint64_t)d1 & 0xfffffffffff; + d2 += c; c = (uint64_t)(d2 >> 42); h2 = (uint64_t)d2 & 0x3ffffffffff; + h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff; + h1 += c; + + in += POLY1305_BLOCK_SIZE; + inlen -= POLY1305_BLOCK_SIZE; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; +} + +static void +poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) { + poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; + uint64_t h0, h1, h2, c; + uint64_t g0, g1, g2; + uint64_t t0, t1; + + /* process the remaining block */ + if (remaining) { + unsigned char final[POLY1305_BLOCK_SIZE] = {0}; + size_t i; + for (i = 0; i < remaining; i++) + final[i] = in[i]; + final[remaining] = 1; + st->final = 1; + poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE); + } + + /* fully carry h */ + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + c = (h1 >> 44); h1 &= 0xfffffffffff; + h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; + h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; + h1 += c; c = (h1 >> 44); h1 &= 
0xfffffffffff; + h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; + h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; + h1 += c; + + /* compute h + -p */ + g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff; + g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff; + g2 = h2 + c - ((uint64_t)1 << 42); + + /* select h if h < p, or h + -p if h >= p */ + c = (g2 >> 63) - 1; + h0 = (h0 & ~c) | (g0 & c); + h1 = (h1 & ~c) | (g1 & c); + h2 = (h2 & ~c) | (g2 & c); + + /* h = (h + pad) */ + t0 = st->pad[0]; + t1 = st->pad[1]; + + h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff; + h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff; + h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff; + + /* mac = h % (2^128) */ + h0 = ((h0 ) | (h1 << 44)); + h1 = ((h1 >> 20) | (h2 << 24)); + + U64TO8(&mac[0], h0); + U64TO8(&mac[8], h1); + + /* zero out the state */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->r[0] = 0; + st->r[1] = 0; + st->r[2] = 0; + st->pad[0] = 0; + st->pad[1] = 0; +} + + +static void +poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) { + poly1305_state_ref_t st; + size_t blocks; + poly1305_init_ext_ref(&st, key, inlen); + blocks = (inlen & ~(POLY1305_BLOCK_SIZE - 1)); + if (blocks) { + poly1305_blocks_ref(&st, in, blocks); + in += blocks; + inlen -= blocks; + } + poly1305_finish_ext_ref(&st, in, inlen, mac); +} + diff --git a/src/libcryptobox/poly1305/sse2.S b/src/libcryptobox/poly1305/sse2.S new file mode 100644 index 000000000..a4ec004d9 --- /dev/null +++ b/src/libcryptobox/poly1305/sse2.S @@ -0,0 +1,966 @@ +#include "../chacha20/macro.S" +#include "constants.S" +SECTION_TEXT + +GLOBAL_HIDDEN_FN poly1305_block_size_sse2 +movl $32, %eax +ret +FN_END poly1305_block_size_sse2 + +GLOBAL_HIDDEN_FN poly1305_init_ext_sse2 +poly1305_init_ext_sse2_local: +pushq %r15 +xorps %xmm0, %xmm0 +testq %rdx, %rdx +pushq %r14 +movq %rdx, %r11 +movq $-1, %rax 
+cmove %rax, %r11 +pushq %r13 +movabsq $17575274610687, %r9 +pushq %r12 +pushq %rbp +movq %r11, %r13 +movabsq $17592186044415, %rbp +pushq %rbx +xorl %ebx, %ebx +movdqu %xmm0, 32(%rdi) +movdqu %xmm0, (%rdi) +movdqu %xmm0, 16(%rdi) +movq 8(%rsi), %rcx +movq (%rsi), %rax +movq %rcx, %rdx +shrq $24, %rcx +andq %rax, %r9 +salq $20, %rdx +shrq $44, %rax +movq %r9, %r8 +orq %rax, %rdx +shrq $26, %r8 +movabsq $17592181915647, %rax +andq %rax, %rdx +movabsq $68719475727, %rax +andq %rax, %rcx +movl %r9d, %eax +andl $67108863, %eax +movl %eax, 40(%rdi) +movl %edx, %eax +sall $18, %eax +orl %r8d, %eax +movq %rdx, %r8 +andl $67108863, %eax +shrq $34, %r8 +movl %eax, 44(%rdi) +movq %rdx, %rax +shrq $8, %rax +andl $67108863, %eax +movl %eax, 48(%rdi) +movl %ecx, %eax +sall $10, %eax +orl %r8d, %eax +movq %rdi, %r8 +andl $67108863, %eax +movl %eax, 52(%rdi) +movq %rcx, %rax +shrq $16, %rax +movl %eax, 56(%rdi) +movq 16(%rsi), %rax +movq %rax, 104(%rdi) +movq 24(%rsi), %rax +movq %rdx, %rsi +movq %rax, 112(%rdi) +poly1305_init_ext_sse2_7: +testq %rbx, %rbx +jne poly1305_init_ext_sse2_4 +cmpq $16, %r13 +jbe poly1305_init_ext_sse2_5 +leaq 60(%r8), %rdi +jmp poly1305_init_ext_sse2_6 +poly1305_init_ext_sse2_4: +cmpq $96, %r13 +jb poly1305_init_ext_sse2_5 +leaq 80(%r8), %rdi +poly1305_init_ext_sse2_6: +imulq $20, %rcx, %r10 +movq $0, -48(%rsp) +movq $0, -32(%rsp) +leaq (%rsi,%rsi), %r14 +leaq (%r9,%r9), %r11 +movq %r10, %rax +mulq %r14 +movq %rax, %r14 +movq %r9, %rax +movq %rdx, %r15 +mulq %r9 +addq %rax, %r14 +movq %r14, %rax +adcq %rdx, %r15 +leaq (%rcx,%rcx), %rdx +andq %rbp, %rax +movq %rax, -16(%rsp) +movq %r11, %rax +movq %rdx, -24(%rsp) +mulq %rsi +movq %rax, %r11 +movq %r10, %rax +movq %rdx, %r12 +mulq %rcx +movq -16(%rsp), %rcx +addq %rax, %r11 +movq %r14, %rax +adcq %rdx, %r12 +shrdq $44, %r15, %rax +movq %rax, -56(%rsp) +movq -24(%rsp), %rax +addq -56(%rsp), %r11 +adcq -48(%rsp), %r12 +mulq %r9 +movq %r11, %r14 +andq %rbp, %r14 +movq %rax, %r9 +movq %rsi, %rax +movq %rdx, 
%r10 +mulq %rsi +addq %rax, %r9 +movq %r11, %rax +adcq %rdx, %r10 +shrdq $44, %r12, %rax +movq %rax, -40(%rsp) +movabsq $4398046511103, %rax +addq -40(%rsp), %r9 +adcq -32(%rsp), %r10 +andq %r9, %rax +incq %rbx +shrdq $42, %r10, %r9 +leaq (%r9,%r9,4), %r9 +addq %r9, %rcx +movq %rcx, %r9 +shrq $44, %rcx +addq %r14, %rcx +andq %rbp, %r9 +movq %rcx, %rsi +shrq $44, %rcx +movq %r9, %rdx +addq %rax, %rcx +movl %r9d, %eax +andq %rbp, %rsi +andl $67108863, %eax +shrq $26, %rdx +movl %eax, (%rdi) +movl %esi, %eax +sall $18, %eax +orl %edx, %eax +movq %rsi, %rdx +andl $67108863, %eax +shrq $34, %rdx +movl %eax, 4(%rdi) +movq %rsi, %rax +shrq $8, %rax +andl $67108863, %eax +movl %eax, 8(%rdi) +movl %ecx, %eax +sall $10, %eax +orl %edx, %eax +andl $67108863, %eax +movl %eax, 12(%rdi) +movq %rcx, %rax +shrq $16, %rax +cmpq $2, %rbx +movl %eax, 16(%rdi) +jne poly1305_init_ext_sse2_7 +poly1305_init_ext_sse2_5: +movq $0, 120(%r8) +popq %rbx +popq %rbp +popq %r12 +popq %r13 +popq %r14 +popq %r15 +ret +FN_END poly1305_init_ext_sse2 + + +GLOBAL_HIDDEN_FN poly1305_blocks_sse2 +poly1305_blocks_sse2_local: +pushq %rbp +movq %rsp, %rbp +pushq %rbx +andq $-64, %rsp +subq $328, %rsp +movq $(1 << 24), %rax +movd %rax, %xmm1 +movq $((1 << 26) - 1), %rax +movd %rax, %xmm0 +pshufd $68, %xmm1, %xmm1 +pshufd $68, %xmm0, %xmm0 +movq 120(%rdi), %rax +movaps %xmm1, 312(%rsp) +testb $4, %al +je poly1305_blocks_sse2_11 +movaps 312(%rsp), %xmm1 +psrldq $8, %xmm1 +movaps %xmm1, 312(%rsp) +poly1305_blocks_sse2_11: +testb $8, %al +je poly1305_blocks_sse2_12 +xorps %xmm1, %xmm1 +movaps %xmm1, 312(%rsp) +poly1305_blocks_sse2_12: +testb $1, %al +jne poly1305_blocks_sse2_13 +movq 16(%rsi), %xmm1 +movaps %xmm0, %xmm3 +movaps %xmm0, %xmm9 +movq (%rsi), %xmm15 +orq $1, %rax +subq $32, %rdx +movq 8(%rsi), %xmm12 +punpcklqdq %xmm1, %xmm15 +movq 24(%rsi), %xmm1 +movaps %xmm15, %xmm8 +pand %xmm15, %xmm3 +psrlq $52, %xmm15 +addq $32, %rsi +punpcklqdq %xmm1, %xmm12 +movaps %xmm12, %xmm1 +psrlq $26, %xmm8 +psllq $12, 
%xmm1 +pand %xmm0, %xmm8 +movq %rax, 120(%rdi) +por %xmm1, %xmm15 +psrlq $40, %xmm12 +pand %xmm15, %xmm9 +por 312(%rsp), %xmm12 +psrlq $26, %xmm15 +pand %xmm0, %xmm15 +jmp poly1305_blocks_sse2_14 +poly1305_blocks_sse2_13: +movdqu (%rdi), %xmm8 +movdqu 16(%rdi), %xmm15 +movdqu 32(%rdi), %xmm12 +pshufd $80, %xmm8, %xmm3 +pshufd $250, %xmm8, %xmm8 +pshufd $80, %xmm15, %xmm9 +pshufd $250, %xmm15, %xmm15 +pshufd $80, %xmm12, %xmm12 +poly1305_blocks_sse2_14: +movq 120(%rdi), %rax +testb $48, %al +je poly1305_blocks_sse2_15 +testb $16, %al +movd 56(%rdi), %xmm2 +leaq 40(%rdi), %rax +je poly1305_blocks_sse2_16 +movdqu 60(%rdi), %xmm1 +movdqu (%rax), %xmm4 +movd %xmm2, %eax +movd 76(%rdi), %xmm2 +movaps %xmm1, %xmm7 +movd %eax, %xmm5 +punpckldq %xmm4, %xmm7 +punpckhdq %xmm4, %xmm1 +punpcklqdq %xmm5, %xmm2 +jmp poly1305_blocks_sse2_17 +poly1305_blocks_sse2_16: +movdqu (%rax), %xmm1 +movl $1, %r8d +movd %r8d, %xmm4 +movaps %xmm1, %xmm7 +punpckldq %xmm4, %xmm7 +punpckhdq %xmm4, %xmm1 +poly1305_blocks_sse2_17: +pshufd $80, %xmm7, %xmm11 +pshufd $80, %xmm1, %xmm4 +pshufd $250, %xmm7, %xmm7 +movaps %xmm11, 168(%rsp) +pshufd $250, %xmm1, %xmm1 +jmp poly1305_blocks_sse2_18 +poly1305_blocks_sse2_15: +movdqu 60(%rdi), %xmm1 +movd 76(%rdi), %xmm2 +pshufd $0, %xmm2, %xmm2 +pshufd $0, %xmm1, %xmm11 +pshufd $85, %xmm1, %xmm7 +pshufd $170, %xmm1, %xmm4 +movaps %xmm11, 168(%rsp) +pshufd $255, %xmm1, %xmm1 +poly1305_blocks_sse2_18: +movaps %xmm1, %xmm14 +movaps %xmm7, %xmm5 +movaps %xmm4, %xmm13 +movaps %xmm1, 264(%rsp) +movaps %xmm2, %xmm1 +cmpq $63, %rdx +movq $(5), %r8 +movd %r8, %xmm6 +pshufd $68, %xmm6, %xmm6 +pmuludq %xmm6, %xmm5 +movaps %xmm4, 296(%rsp) +pmuludq %xmm6, %xmm13 +movaps %xmm2, 152(%rsp) +pmuludq %xmm6, %xmm14 +pmuludq %xmm6, %xmm1 +movaps %xmm5, 88(%rsp) +movaps %xmm13, 72(%rsp) +movaps %xmm14, 56(%rsp) +movaps %xmm1, 40(%rsp) +jbe poly1305_blocks_sse2_19 +movdqu 80(%rdi), %xmm1 +movd 96(%rdi), %xmm2 +movq %rdx, %rcx +pshufd $0, %xmm2, %xmm2 +movaps %xmm2, 24(%rsp) 
+pmuludq %xmm6, %xmm2 +pshufd $85, %xmm1, %xmm4 +movaps %xmm4, 280(%rsp) +pmuludq %xmm6, %xmm4 +pshufd $255, %xmm1, %xmm13 +pshufd $170, %xmm1, %xmm5 +movaps 72(%rsp), %xmm14 +movaps %xmm5, 216(%rsp) +pmuludq %xmm6, %xmm5 +movq %rsi, %rax +movaps %xmm4, -24(%rsp) +movaps %xmm13, %xmm4 +pshufd $0, %xmm1, %xmm1 +pmuludq %xmm6, %xmm4 +movaps %xmm14, -8(%rsp) +movaps %xmm5, 8(%rsp) +movaps 168(%rsp), %xmm5 +movaps %xmm1, 248(%rsp) +movaps 56(%rsp), %xmm1 +movaps %xmm4, 120(%rsp) +movaps 40(%rsp), %xmm4 +movaps %xmm13, 136(%rsp) +movaps %xmm2, 200(%rsp) +movaps %xmm1, 104(%rsp) +movaps %xmm4, 184(%rsp) +movaps %xmm5, 232(%rsp) +jmp poly1305_blocks_sse2_20 +.p2align 6 +poly1305_blocks_sse2_20: +movaps -24(%rsp), %xmm5 +movaps %xmm8, %xmm13 +subq $64, %rcx +movaps 8(%rsp), %xmm4 +movaps 120(%rsp), %xmm10 +pmuludq %xmm12, %xmm5 +pmuludq %xmm15, %xmm4 +movaps 8(%rsp), %xmm2 +pmuludq %xmm9, %xmm10 +movaps 120(%rsp), %xmm11 +movaps 200(%rsp), %xmm14 +pmuludq %xmm12, %xmm2 +paddq %xmm4, %xmm5 +pmuludq %xmm15, %xmm11 +movaps 120(%rsp), %xmm1 +paddq %xmm10, %xmm5 +pmuludq %xmm8, %xmm14 +movaps 200(%rsp), %xmm10 +movaps 200(%rsp), %xmm4 +pmuludq %xmm12, %xmm1 +movaps 248(%rsp), %xmm8 +pmuludq %xmm15, %xmm10 +paddq %xmm11, %xmm2 +pmuludq %xmm12, %xmm4 +paddq %xmm14, %xmm5 +movaps 200(%rsp), %xmm11 +movaps 248(%rsp), %xmm14 +pmuludq %xmm15, %xmm8 +pmuludq 248(%rsp), %xmm12 +pmuludq %xmm9, %xmm11 +paddq %xmm10, %xmm1 +movaps 248(%rsp), %xmm10 +pmuludq 280(%rsp), %xmm15 +pmuludq %xmm3, %xmm14 +paddq %xmm15, %xmm12 +paddq %xmm8, %xmm4 +pmuludq %xmm13, %xmm10 +movq 24(%rax), %xmm15 +movaps 248(%rsp), %xmm8 +paddq %xmm11, %xmm2 +movaps %xmm3, %xmm11 +movaps 280(%rsp), %xmm3 +paddq %xmm14, %xmm5 +pmuludq %xmm9, %xmm8 +paddq %xmm10, %xmm2 +movq 16(%rax), %xmm14 +movaps 280(%rsp), %xmm10 +pmuludq %xmm9, %xmm3 +pmuludq 216(%rsp), %xmm9 +paddq %xmm9, %xmm12 +paddq %xmm8, %xmm1 +movq (%rax), %xmm8 +pmuludq %xmm11, %xmm10 +paddq %xmm3, %xmm4 +movaps 216(%rsp), %xmm3 +punpcklqdq %xmm14, %xmm8 
+movaps 280(%rsp), %xmm14 +pmuludq %xmm13, %xmm3 +paddq %xmm10, %xmm2 +movq 8(%rax), %xmm10 +pmuludq %xmm13, %xmm14 +pmuludq 136(%rsp), %xmm13 +paddq %xmm13, %xmm12 +punpcklqdq %xmm15, %xmm10 +movaps %xmm10, %xmm9 +movaps 216(%rsp), %xmm15 +paddq %xmm3, %xmm4 +psllq $12, %xmm9 +movaps %xmm0, %xmm3 +paddq %xmm14, %xmm1 +pmuludq %xmm11, %xmm15 +pand %xmm8, %xmm3 +movaps 136(%rsp), %xmm14 +movaps %xmm3, -40(%rsp) +movaps %xmm8, %xmm3 +movdqu 48(%rax), %xmm13 +psrlq $52, %xmm8 +pmuludq %xmm11, %xmm14 +paddq %xmm15, %xmm1 +por %xmm9, %xmm8 +pmuludq 24(%rsp), %xmm11 +paddq %xmm11, %xmm12 +movdqu 32(%rax), %xmm11 +movaps %xmm10, %xmm9 +psrlq $40, %xmm10 +pand %xmm0, %xmm8 +movaps %xmm11, %xmm15 +paddq %xmm14, %xmm4 +xorps %xmm14, %xmm14 +punpckldq %xmm13, %xmm15 +psrlq $14, %xmm9 +addq $64, %rax +pand %xmm0, %xmm9 +psrlq $26, %xmm3 +cmpq $63, %rcx +por 312(%rsp), %xmm10 +movaps %xmm13, -72(%rsp) +movaps %xmm15, %xmm13 +punpckldq %xmm14, %xmm13 +punpckhdq -72(%rsp), %xmm11 +movaps %xmm13, -56(%rsp) +movaps %xmm11, %xmm13 +punpckhdq %xmm14, %xmm11 +pand %xmm0, %xmm3 +psllq $18, %xmm11 +punpckhdq %xmm14, %xmm15 +punpckldq %xmm14, %xmm13 +paddq %xmm11, %xmm4 +movaps -8(%rsp), %xmm11 +psllq $6, %xmm15 +psllq $12, %xmm13 +movaps 88(%rsp), %xmm14 +paddq %xmm15, %xmm2 +pmuludq %xmm10, %xmm11 +paddq %xmm13, %xmm1 +movaps -8(%rsp), %xmm13 +pmuludq %xmm10, %xmm14 +paddq -56(%rsp), %xmm5 +paddq 312(%rsp), %xmm12 +pmuludq %xmm9, %xmm13 +movaps 104(%rsp), %xmm15 +paddq %xmm11, %xmm2 +movaps 184(%rsp), %xmm11 +paddq %xmm14, %xmm5 +movaps 104(%rsp), %xmm14 +pmuludq %xmm9, %xmm15 +pmuludq %xmm10, %xmm11 +paddq %xmm13, %xmm5 +movaps 104(%rsp), %xmm13 +pmuludq %xmm10, %xmm14 +pmuludq 232(%rsp), %xmm10 +paddq %xmm10, %xmm12 +pmuludq %xmm8, %xmm13 +paddq %xmm15, %xmm2 +movaps %xmm8, %xmm10 +paddq %xmm11, %xmm4 +pmuludq %xmm7, %xmm10 +movaps 232(%rsp), %xmm11 +movaps 184(%rsp), %xmm15 +paddq %xmm14, %xmm1 +pmuludq %xmm9, %xmm11 +paddq %xmm13, %xmm5 +movaps 184(%rsp), %xmm13 +movaps 184(%rsp), 
%xmm14 +pmuludq %xmm3, %xmm15 +pmuludq %xmm9, %xmm13 +paddq %xmm11, %xmm4 +pmuludq %xmm8, %xmm14 +movaps 232(%rsp), %xmm11 +paddq %xmm10, %xmm4 +paddq %xmm15, %xmm5 +pmuludq %xmm7, %xmm9 +pmuludq %xmm8, %xmm11 +paddq %xmm13, %xmm1 +movaps 232(%rsp), %xmm13 +movaps 296(%rsp), %xmm10 +paddq %xmm14, %xmm2 +pmuludq 296(%rsp), %xmm8 +movaps -40(%rsp), %xmm14 +pmuludq %xmm3, %xmm13 +paddq %xmm9, %xmm12 +paddq %xmm11, %xmm1 +movaps %xmm3, %xmm11 +paddq %xmm8, %xmm12 +movaps 232(%rsp), %xmm15 +pmuludq %xmm7, %xmm11 +pmuludq %xmm3, %xmm10 +paddq %xmm13, %xmm2 +movaps %xmm14, %xmm13 +movaps 296(%rsp), %xmm9 +pmuludq %xmm14, %xmm15 +pmuludq 264(%rsp), %xmm3 +paddq %xmm11, %xmm1 +pmuludq %xmm7, %xmm13 +paddq %xmm3, %xmm12 +movaps 264(%rsp), %xmm11 +paddq %xmm10, %xmm4 +pmuludq %xmm14, %xmm9 +paddq %xmm15, %xmm5 +pmuludq %xmm14, %xmm11 +movaps %xmm5, %xmm8 +paddq %xmm13, %xmm2 +psrlq $26, %xmm8 +paddq %xmm9, %xmm1 +pand %xmm0, %xmm5 +pmuludq 152(%rsp), %xmm14 +paddq %xmm14, %xmm12 +paddq %xmm8, %xmm2 +paddq %xmm11, %xmm4 +movaps %xmm2, %xmm9 +movaps %xmm2, %xmm8 +movaps %xmm4, %xmm3 +psrlq $26, %xmm9 +pand %xmm0, %xmm4 +psrlq $26, %xmm3 +paddq %xmm9, %xmm1 +pand %xmm0, %xmm8 +paddq %xmm3, %xmm12 +movaps %xmm1, %xmm10 +movaps %xmm1, %xmm9 +movaps %xmm12, %xmm3 +psrlq $26, %xmm10 +pand %xmm0, %xmm12 +psrlq $26, %xmm3 +paddq %xmm10, %xmm4 +pand %xmm0, %xmm9 +pmuludq %xmm6, %xmm3 +movaps %xmm4, %xmm1 +movaps %xmm4, %xmm15 +psrlq $26, %xmm1 +pand %xmm0, %xmm15 +paddq %xmm1, %xmm12 +paddq %xmm3, %xmm5 +movaps %xmm5, %xmm2 +movaps %xmm5, %xmm3 +psrlq $26, %xmm2 +pand %xmm0, %xmm3 +paddq %xmm2, %xmm8 +ja poly1305_blocks_sse2_20 +leaq -64(%rdx), %rax +andl $63, %edx +andq $-64, %rax +leaq 64(%rsi,%rax), %rsi +poly1305_blocks_sse2_19: +cmpq $31, %rdx +jbe poly1305_blocks_sse2_21 +movaps 56(%rsp), %xmm11 +movaps %xmm15, %xmm1 +movaps %xmm15, %xmm14 +movaps 72(%rsp), %xmm5 +movaps %xmm12, %xmm4 +movaps %xmm15, %xmm10 +movaps 88(%rsp), %xmm2 +pmuludq %xmm11, %xmm14 +movaps %xmm8, %xmm15 
+pmuludq %xmm5, %xmm1 +movaps 40(%rsp), %xmm13 +testq %rsi, %rsi +pmuludq %xmm12, %xmm2 +pmuludq %xmm12, %xmm5 +pmuludq %xmm11, %xmm4 +paddq %xmm1, %xmm2 +pmuludq %xmm9, %xmm11 +movaps %xmm12, %xmm1 +paddq %xmm14, %xmm5 +pmuludq %xmm13, %xmm15 +movaps %xmm9, %xmm14 +pmuludq %xmm13, %xmm14 +pmuludq %xmm13, %xmm1 +paddq %xmm11, %xmm2 +movaps 168(%rsp), %xmm11 +pmuludq %xmm10, %xmm13 +paddq %xmm15, %xmm2 +movaps %xmm9, %xmm15 +paddq %xmm14, %xmm5 +pmuludq %xmm11, %xmm12 +movaps %xmm3, %xmm14 +pmuludq %xmm11, %xmm14 +movaps %xmm13, 248(%rsp) +movaps %xmm10, %xmm13 +pmuludq %xmm7, %xmm15 +paddq 248(%rsp), %xmm4 +pmuludq %xmm11, %xmm13 +pmuludq %xmm7, %xmm10 +paddq %xmm14, %xmm2 +movaps %xmm13, 280(%rsp) +movaps %xmm8, %xmm13 +pmuludq %xmm11, %xmm13 +paddq %xmm10, %xmm12 +movaps 296(%rsp), %xmm10 +paddq 280(%rsp), %xmm1 +pmuludq %xmm9, %xmm11 +pmuludq 296(%rsp), %xmm9 +pmuludq %xmm3, %xmm10 +paddq %xmm9, %xmm12 +paddq %xmm13, %xmm5 +movaps %xmm3, %xmm13 +paddq %xmm15, %xmm1 +pmuludq %xmm7, %xmm13 +paddq %xmm11, %xmm4 +movaps 296(%rsp), %xmm11 +pmuludq %xmm8, %xmm7 +pmuludq %xmm8, %xmm11 +pmuludq 264(%rsp), %xmm8 +paddq %xmm8, %xmm12 +paddq %xmm13, %xmm5 +paddq %xmm7, %xmm4 +movaps 264(%rsp), %xmm7 +paddq %xmm11, %xmm1 +paddq %xmm10, %xmm4 +pmuludq %xmm3, %xmm7 +pmuludq 152(%rsp), %xmm3 +paddq %xmm3, %xmm12 +paddq %xmm7, %xmm1 +je poly1305_blocks_sse2_22 +movdqu (%rsi), %xmm7 +xorps %xmm3, %xmm3 +paddq 312(%rsp), %xmm12 +movdqu 16(%rsi), %xmm8 +movaps %xmm7, %xmm9 +punpckldq %xmm8, %xmm9 +punpckhdq %xmm8, %xmm7 +movaps %xmm9, %xmm10 +movaps %xmm7, %xmm8 +punpckldq %xmm3, %xmm10 +punpckhdq %xmm3, %xmm9 +punpckhdq %xmm3, %xmm7 +punpckldq %xmm3, %xmm8 +movaps %xmm8, %xmm3 +psllq $6, %xmm9 +paddq %xmm10, %xmm2 +psllq $12, %xmm3 +paddq %xmm9, %xmm5 +psllq $18, %xmm7 +paddq %xmm3, %xmm4 +paddq %xmm7, %xmm1 +poly1305_blocks_sse2_22: +movaps %xmm2, %xmm8 +movaps %xmm1, %xmm3 +movaps %xmm1, %xmm15 +psrlq $26, %xmm8 +pand %xmm0, %xmm2 +pand %xmm0, %xmm15 +psrlq $26, %xmm3 +paddq 
%xmm5, %xmm8 +paddq %xmm12, %xmm3 +movaps %xmm8, %xmm9 +pand %xmm0, %xmm8 +movaps %xmm3, %xmm1 +psrlq $26, %xmm9 +movaps %xmm3, %xmm12 +psrlq $26, %xmm1 +paddq %xmm4, %xmm9 +pand %xmm0, %xmm12 +pmuludq %xmm1, %xmm6 +movaps %xmm9, %xmm3 +pand %xmm0, %xmm9 +psrlq $26, %xmm3 +paddq %xmm3, %xmm15 +paddq %xmm6, %xmm2 +movaps %xmm15, %xmm3 +pand %xmm0, %xmm15 +movaps %xmm2, %xmm1 +psrlq $26, %xmm3 +psrlq $26, %xmm1 +paddq %xmm3, %xmm12 +movaps %xmm0, %xmm3 +paddq %xmm1, %xmm8 +pand %xmm2, %xmm3 +poly1305_blocks_sse2_21: +testq %rsi, %rsi +je poly1305_blocks_sse2_23 +pshufd $8, %xmm3, %xmm3 +pshufd $8, %xmm8, %xmm8 +pshufd $8, %xmm9, %xmm9 +pshufd $8, %xmm15, %xmm15 +pshufd $8, %xmm12, %xmm12 +punpcklqdq %xmm8, %xmm3 +punpcklqdq %xmm15, %xmm9 +movdqu %xmm3, (%rdi) +movdqu %xmm9, 16(%rdi) +movq %xmm12, 32(%rdi) +jmp poly1305_blocks_sse2_10 +poly1305_blocks_sse2_23: +movaps %xmm3, %xmm0 +movaps %xmm8, %xmm4 +movaps %xmm9, %xmm2 +psrldq $8, %xmm0 +movaps %xmm15, %xmm10 +paddq %xmm0, %xmm3 +psrldq $8, %xmm4 +movaps %xmm12, %xmm0 +movd %xmm3, %edx +paddq %xmm4, %xmm8 +psrldq $8, %xmm2 +movl %edx, %ecx +movd %xmm8, %eax +paddq %xmm2, %xmm9 +shrl $26, %ecx +psrldq $8, %xmm10 +andl $67108863, %edx +addl %ecx, %eax +movd %xmm9, %ecx +paddq %xmm10, %xmm15 +movl %eax, %r9d +shrl $26, %eax +psrldq $8, %xmm0 +addl %ecx, %eax +movd %xmm15, %ecx +paddq %xmm0, %xmm12 +movl %eax, %esi +andl $67108863, %r9d +movd %xmm12, %r10d +shrl $26, %esi +andl $67108863, %eax +addl %ecx, %esi +salq $8, %rax +movl %r9d, %ecx +shrl $18, %r9d +movl %esi, %r8d +shrl $26, %esi +andl $67108863, %r8d +addl %r10d, %esi +orq %r9, %rax +salq $16, %rsi +movq %r8, %r9 +shrl $10, %r8d +salq $26, %rcx +orq %r8, %rsi +salq $34, %r9 +orq %rdx, %rcx +movq %rsi, %r11 +shrq $42, %rsi +movabsq $17592186044415, %rdx +orq %r9, %rax +movabsq $4398046511103, %r8 +andq %rdx, %rcx +andq %rdx, %rax +andq %r8, %r11 +leaq (%rsi,%rsi,4), %rsi +addq %rsi, %rcx +movq %rcx, %r10 +shrq $44, %rcx +addq %rcx, %rax +andq %rdx, %r10 +movq 
%rax, %r9 +shrq $44, %rax +addq %r11, %rax +andq %rdx, %r9 +movabsq $-4398046511104, %r11 +movq %rax, %rcx +andq %r8, %rcx +shrq $42, %rax +leaq (%rax,%rax,4), %rsi +addq %rcx, %r11 +addq %r10, %rsi +movq %rsi, %r8 +shrq $44, %rsi +andq %rdx, %r8 +addq %r9, %rsi +leaq 5(%r8), %r9 +movq %r9, %rbx +andq %rdx, %r9 +shrq $44, %rbx +addq %rsi, %rbx +movq %rbx, %rax +andq %rbx, %rdx +shrq $44, %rax +addq %rax, %r11 +movq %r11, %rax +shrq $63, %rax +decq %rax +movq %rax, %r10 +andq %rax, %r9 +andq %rax, %rdx +notq %r10 +andq %r11, %rax +andq %r10, %r8 +andq %r10, %rsi +andq %r10, %rcx +orq %r9, %r8 +orq %rdx, %rsi +orq %rax, %rcx +movq %r8, (%rdi) +movq %rsi, 8(%rdi) +movq %rcx, 16(%rdi) +poly1305_blocks_sse2_10: +movq -8(%rbp), %rbx +leave +ret +FN_END poly1305_blocks_sse2 + +GLOBAL_HIDDEN_FN poly1305_finish_ext_sse2 +poly1305_finish_ext_sse2_local: +pushq %r12 +movq %rcx, %r12 +pushq %rbp +movq %rdx, %rbp +pushq %rbx +movq %rdi, %rbx +subq $32, %rsp +testq %rdx, %rdx +je poly1305_finish_ext_sse2_27 +xorl %eax, %eax +movq %rsp, %rdi +movl $8, %ecx +rep stosl +subq %rsp, %rsi +testb $16, %dl +movq %rsp, %rax +je poly1305_finish_ext_sse2_28 +movdqu (%rsp,%rsi), %xmm0 +addq $16, %rax +movaps %xmm0, (%rsp) +poly1305_finish_ext_sse2_28: +testb $8, %bpl +je poly1305_finish_ext_sse2_29 +movq (%rax,%rsi), %rdx +movq %rdx, (%rax) +addq $8, %rax +poly1305_finish_ext_sse2_29: +testb $4, %bpl +je poly1305_finish_ext_sse2_30 +movl (%rax,%rsi), %edx +movl %edx, (%rax) +addq $4, %rax +poly1305_finish_ext_sse2_30: +testb $2, %bpl +je poly1305_finish_ext_sse2_31 +movw (%rax,%rsi), %dx +movw %dx, (%rax) +addq $2, %rax +poly1305_finish_ext_sse2_31: +testb $1, %bpl +je poly1305_finish_ext_sse2_32 +movb (%rax,%rsi), %dl +movb %dl, (%rax) +poly1305_finish_ext_sse2_32: +cmpq $16, %rbp +je poly1305_finish_ext_sse2_33 +movb $1, (%rsp,%rbp) +poly1305_finish_ext_sse2_33: +cmpq $16, %rbp +movl $32, %edx +movq %rsp, %rsi +sbbq %rax, %rax +movq %rbx, %rdi +andl $4, %eax +addq $4, %rax +orq %rax, 
120(%rbx) +call poly1305_blocks_sse2_local +poly1305_finish_ext_sse2_27: +movq 120(%rbx), %rax +testb $1, %al +je poly1305_finish_ext_sse2_35 +decq %rbp +cmpq $15, %rbp +jbe poly1305_finish_ext_sse2_36 +orq $16, %rax +jmp poly1305_finish_ext_sse2_40 +poly1305_finish_ext_sse2_36: +orq $32, %rax +poly1305_finish_ext_sse2_40: +movq %rax, 120(%rbx) +movl $32, %edx +xorl %esi, %esi +movq %rbx, %rdi +call poly1305_blocks_sse2_local +poly1305_finish_ext_sse2_35: +movq 8(%rbx), %rax +movq 112(%rbx), %rsi +movq %rax, %rdx +movq %rax, %rcx +movq 16(%rbx), %rax +shrq $20, %rcx +salq $44, %rdx +orq (%rbx), %rdx +salq $24, %rax +orq %rcx, %rax +movq 104(%rbx), %rcx +addq %rcx, %rdx +adcq %rsi, %rax +xorps %xmm0, %xmm0 +movdqu %xmm0, (%rbx) +movdqu %xmm0, 16(%rbx) +movdqu %xmm0, 32(%rbx) +movdqu %xmm0, 48(%rbx) +movdqu %xmm0, 64(%rbx) +movdqu %xmm0, 80(%rbx) +movdqu %xmm0, 96(%rbx) +movdqu %xmm0, 112(%rbx) +movq %rdx, (%r12) +movq %rax, 8(%r12) +addq $32, %rsp +popq %rbx +popq %rbp +popq %r12 +ret +FN_END poly1305_finish_ext_sse2 + +GLOBAL_HIDDEN_FN poly1305_auth_sse2 +cmpq $128, %rdx +jb poly1305_auth_x86_local +pushq %rbp +movq %rsp, %rbp +pushq %r14 +pushq %r13 +movq %rdi, %r13 +pushq %r12 +movq %rsi, %r12 +movq %rcx, %rsi +pushq %rbx +movq %rdx, %rbx +andq $-64, %rsp +movq %rbx, %r14 +addq $-128, %rsp +movq %rsp, %rdi +call poly1305_init_ext_sse2_local +andq $-32, %r14 +je poly1305_auth_sse2_42 +movq %r12, %rsi +movq %r14, %rdx +movq %rsp, %rdi +call poly1305_blocks_sse2_local +addq %r14, %r12 +subq %r14, %rbx +poly1305_auth_sse2_42: +movq %r13, %rcx +movq %rbx, %rdx +movq %r12, %rsi +movq %rsp, %rdi +call poly1305_finish_ext_sse2_local +leaq -32(%rbp), %rsp +popq %rbx +popq %r12 +popq %r13 +popq %r14 +popq %rbp +ret +FN_END poly1305_auth_sse2 + + + + + |