TARGET_ARCHITECTURE(ARCH)
SET(CHACHASRC chacha20/chacha.c chacha20/ref.c)
-SET(POLYSRC poly1305/poly1305-donna.c)
+SET(POLYSRC poly1305/poly1305.c)
# For now we support only x86_64 architecture with optimizations
IF(${ARCH} STREQUAL "x86_64")
ASM_OP(HAVE_AVX2 "vpaddq %ymm0, %ymm0, %ymm0" "avx2")
ASM_OP(HAVE_AVX "vpaddq %xmm0, %xmm0, %xmm0" "avx")
ASM_OP(HAVE_SSE2 "pmuludq %xmm0, %xmm0" "sse2")
+
ASM_OP(HAVE_SLASHMACRO "
.macro TEST1 op
\\op %eax, %eax
.endm
TEST1 xorl
" "slash macro convention")
+
ASM_OP(HAVE_DOLLARMACRO "
.macro TEST1 op
$0 %eax, %eax
.endm
TEST1 xorl
" "dollar macro convention")
+
CONFIGURE_FILE(platform_config.h.in platform_config.h)
INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
SET(CURVESRC curve25519/curve25519-donna-c64.c)
+ SET(POLYSRC ${POLYSRC} poly1305/ref-64.c)
ELSEIF(${ARCH} STREQUAL "i386")
+ SET(POLYSRC ${POLYSRC} poly1305/ref-32.c)
SET(CURVESRC curve25519/curve25519-donna.c)
ELSE()
SET(CURVESRC curve25519/ref.c)
+ SET(POLYSRC ${POLYSRC} poly1305/ref-32.c)
ENDIF()
IF(HAVE_AVX2)
SET(CHACHASRC ${CHACHASRC} chacha20/avx2.S)
+ SET(POLYSRC ${POLYSRC} poly1305/avx2.S)
ENDIF(HAVE_AVX2)
IF(HAVE_AVX)
SET(CHACHASRC ${CHACHASRC} chacha20/avx.S)
+ SET(POLYSRC ${POLYSRC} poly1305/avx.S)
ENDIF(HAVE_AVX)
IF(HAVE_SSE2)
SET(CHACHASRC ${CHACHASRC} chacha20/sse2.S)
+ SET(POLYSRC ${POLYSRC} poly1305/sse2.S)
ENDIF(HAVE_SSE2)
SET(LIBCRYPTOBOXSRC cryptobox.c)
#include "macro.S"
+#include "constants.S"
SECTION_TEXT
GLOBAL_HIDDEN_FN chacha_blocks_avx
#include "macro.S"
+#include "constants.S"
SECTION_TEXT
GLOBAL_HIDDEN_FN chacha_blocks_avx2
/* Copyright (c) 2015, Vsevolod Stakhov
+ * Copyright (c) 2015, Andrew Moon
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
--- /dev/null
+SECTION_RODATA
+.p2align 4,,15
+chacha_constants:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 /* "expand 32-byte k" */
+.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */
+.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */
#endif
#endif
.endm
-
-SECTION_RODATA
-.p2align 4,,15
-chacha_constants:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 /* "expand 32-byte k" */
-.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */
-.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */
#include "macro.S"
-
+#include "constants.S"
SECTION_TEXT
GLOBAL_HIDDEN_FN chacha_blocks_sse2
#include "cryptobox.h"
#include "platform_config.h"
#include "chacha20/chacha.h"
-#include "poly1305/poly1305-donna.h"
+#include "poly1305/poly1305.h"
#include "curve25519/curve25519.h"
#include "ottery.h"
}
chacha_load ();
+ poly1305_load ();
}
void
const rspamd_nonce_t nonce,
const rspamd_nm_t nm, rspamd_sig_t sig)
{
- poly1305_context mac_ctx;
+ poly1305_state mac_ctx;
guchar subkey[CHACHA_BLOCKBYTES];
chacha_state s;
gsize r;
r = chacha_update (&s, data, data, len);
chacha_final (&s, data + r);
- poly1305_init (&mac_ctx, subkey);
+ poly1305_init (&mac_ctx, (const poly1305_key *)subkey);
poly1305_update (&mac_ctx, data, len);
poly1305_finish (&mac_ctx, sig);
rspamd_cryptobox_decrypt_nm_inplace (guchar *data, gsize len,
const rspamd_nonce_t nonce, const rspamd_nm_t nm, const rspamd_sig_t sig)
{
- poly1305_context mac_ctx;
+ poly1305_state mac_ctx;
guchar subkey[CHACHA_BLOCKBYTES];
rspamd_sig_t mac;
chacha_state s;
memset (subkey, 0, sizeof (subkey));
chacha_update (&s, subkey, subkey, sizeof (subkey));
- poly1305_init (&mac_ctx, subkey);
+ poly1305_init (&mac_ctx, (const poly1305_key *)subkey);
poly1305_update (&mac_ctx, data, len);
poly1305_finish (&mac_ctx, mac);
+++ /dev/null
-"A state-of-the-art message-authentication code"\r
-\r
-# ABOUT\r
-\r
-See: [http://cr.yp.to/mac.html](http://cr.yp.to/mac.html) and [http://cr.yp.to/mac/poly1305-20050329.pdf](http://cr.yp.to/mac/poly1305-20050329.pdf)\r
-\r
-These are quite portable implementations of increasing efficiency depending on the size of the multiplier available.\r
-Optimized implementations have been moved to [poly1305-opt](https://github.com/floodyberry/poly1305-opt)\r
-\r
-# BUILDING\r
-\r
-## Default\r
-\r
-If compiled with no options, `poly1305-donna.c` will select between the 32 bit and 64 bit implementations based\r
-on what it can tell the compiler supports\r
-\r
- gcc poly1305-donna.c -O3 -o poly1305.o\r
-\r
-## Selecting a specific version\r
-\r
- gcc poly1305-donna.c -O3 -o poly1305.o -DPOLY1305_XXBITS\r
-\r
-Where `-DPOLY1305_XXBITS` is one of\r
-\r
- * `-DPOLY1305_8BITS`, 8->16 bit multiplies, 32 bit additions\r
- * `-DPOLY1305_16BITS`, 16->32 bit multiples, 32 bit additions\r
- * `-DPOLY1305_32BITS`, 32->64 bit multiplies, 64 bit additions\r
- * `-DPOLY1305_64BITS`, 64->128 bit multiplies, 128 bit additions\r
-\r
-8 bit and 16 bit versions were written to keep the code size small, 32 bit and 64 bit versions are mildly optimized due\r
-to needing fewer multiplications. All 4 can be made faster at the expense of increased code size and complexity, which \r
-is not the intention of this project.\r
-\r
-# USAGE\r
-\r
-See: [http://nacl.cace-project.eu/onetimeauth.html](http://nacl.cace-project.eu/onetimeauth.html), in specific, slightly plagiarized:\r
-\r
-The poly1305_auth function, viewed as a function of the message for a uniform random key, is \r
-designed to meet the standard notion of unforgeability after a single message. After the sender \r
-authenticates one message, an attacker cannot find authenticators for any other messages.\r
-\r
-The sender **MUST NOT** use poly1305_auth to authenticate more than one message under the same key.\r
-Authenticators for two messages under the same key should be expected to reveal enough information \r
-to allow forgeries of authenticators on other messages. \r
-\r
-## Functions\r
-\r
-`poly1305_context` is declared in [poly1305.h](poly1305.h) and is an opaque structure large enough to support \r
-every underlying platform specific implementation. It should be size_t aligned, which should be handled already\r
-with the size_t member `aligner`.\r
-\r
-`void poly1305_init(poly1305_context *ctx, const unsigned char key[32]);`\r
-\r
-where\r
-\r
-`key` is the 32 byte key that is **only used for this message and is discarded immediately after**\r
-\r
-`void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes);`\r
-\r
-where `m` is a pointer to the message fragment to be processed, and\r
-\r
-`bytes` is the length of the message fragment\r
-\r
-`void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]);`\r
-\r
-where `mac` is the buffer which receives the 16 byte authenticator. After calling finish, the underlying\r
-implementation will zero out `ctx`.\r
-\r
-`void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]);`\r
-\r
-where `mac` is the buffer which receives the 16 byte authenticator,\r
-\r
-`m` is a pointer to the message to be processed,\r
-\r
-`bytes` is the number of bytes in the message, and\r
-\r
-`key` is the 32 byte key that is **only used for this message and is discarded immediately after**.\r
-\r
-`int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);`\r
-\r
-where `mac1` is compared to `mac2` in constant time and returns `1` if they are equal and `0` if they are not\r
-\r
-`int poly1305_power_on_self_test(void);`\r
-\r
-tests the underlying implementation to verify it is working correctly. It returns `1` if all tests pass, and `0` if \r
-any tests fail.\r
-\r
-## Example\r
-\r
-### Simple\r
-\r
- #include "poly1305-donna.h"\r
-\r
- unsigned char key[32] = {...}, mac[16];\r
- unsigned char msg[] = {...};\r
-\r
- poly1305_auth(mac, msg, msglen, key);\r
-\r
-### Full\r
-\r
-[example-poly1305.c](example-poly1305.c) is a simple example of how to verify the underlying implementation is producing\r
-the correct results, compute an authenticator, and test it against an expected value.\r
-\r
-# LICENSE\r
-\r
-[MIT](http://www.opensource.org/licenses/mit-license.php) or PUBLIC DOMAIN\r
-\r
-\r
-# NAMESAKE\r
-\r
-I borrowed the idea for these from Adam Langley's [curve25519-donna](http://github.com/agl/curve25519-donna), hence\r
-the name.
\ No newline at end of file
--- /dev/null
+#include "../chacha20/macro.S"
+#include "constants.S"
+
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx,0,0
+movl $32, %eax
+ret
+FN_END poly1305_block_size_avx
+
+GLOBAL_HIDDEN_FN_EXT poly1305_init_ext_avx,4,1
+poly1305_init_ext_avx_local:
+pushq %r15
+pushq %r14
+pushq %r13
+pushq %r12
+pushq %rbp
+pushq %rbx
+movq %rdi, %rbp
+testq %rdx, %rdx
+movq $-1, %rax
+cmovne %rdx, %rax
+movq %rax, -16(%rsp)
+vpxor %xmm0, %xmm0, %xmm0
+vmovdqu %xmm0, (%rdi)
+vmovdqu %xmm0, 16(%rdi)
+vmovdqu %xmm0, 32(%rdi)
+movq (%rsi), %r9
+movq 8(%rsi), %r8
+movabsq $17575274610687, %r10
+andq %r9, %r10
+shrq $44, %r9
+movq %r8, %rax
+salq $20, %rax
+orq %rax, %r9
+movabsq $17592181915647, %rax
+andq %rax, %r9
+shrq $24, %r8
+movabsq $68719475727, %rax
+andq %rax, %r8
+leaq 40(%rdi), %r15
+movl %r10d, %eax
+andl $67108863, %eax
+movl %eax, 40(%rdi)
+movl %r9d, %edx
+sall $18, %edx
+movq %r10, %rax
+shrq $26, %rax
+orl %edx, %eax
+andl $67108863, %eax
+movl %eax, 44(%rdi)
+movq %r9, %rax
+shrq $8, %rax
+andl $67108863, %eax
+movl %eax, 48(%rdi)
+movq %r9, %rax
+shrq $34, %rax
+movl %r8d, %edx
+sall $10, %edx
+orl %edx, %eax
+andl $67108863, %eax
+movl %eax, 52(%rdi)
+movq %r8, %rax
+shrq $16, %rax
+movl %eax, 56(%rdi)
+movq 16(%rsi), %rax
+movq %rax, 104(%rdi)
+movq 24(%rsi), %rax
+movq %rax, 112(%rdi)
+movl $0, %ebx
+.L7:
+testq %rbx, %rbx
+jne .L4
+leaq 60(%rbp), %r15
+cmpq $16, -16(%rsp)
+ja .L6
+jmp .L5
+.L4:
+cmpq $1, %rbx
+jne .L6
+leaq 80(%rbp), %r15
+cmpq $95, -16(%rsp)
+jbe .L5
+.L6:
+leaq (%r8,%r8,4), %rsi
+salq $2, %rsi
+leaq (%r9,%r9), %rdi
+movq %rdi, %rax
+mulq %rsi
+movq %rax, %r13
+movq %rdx, %r14
+movq %r10, %rax
+mulq %r10
+addq %r13, %rax
+adcq %r14, %rdx
+movabsq $17592186044415, %rcx
+movq %rax, -72(%rsp)
+movq %rdx, -64(%rsp)
+andq -72(%rsp), %rcx
+leaq (%r10,%r10), %r11
+movq %r11, %rax
+mulq %r9
+movq %rax, %r11
+movq %rdx, %r12
+movq %rsi, %rax
+mulq %r8
+movq %rax, %r13
+movq %rdx, %r14
+addq %r11, %r13
+adcq %r12, %r14
+movq -72(%rsp), %rax
+movq -64(%rsp), %rdx
+shrdq $44, %rdx, %rax
+movq %rax, -56(%rsp)
+movq $0, -48(%rsp)
+addq -56(%rsp), %r13
+adcq -48(%rsp), %r14
+movabsq $17592186044415, %rsi
+andq %r13, %rsi
+leaq (%r8,%r8), %rdi
+movq %rdi, %rax
+mulq %r10
+movq %rax, %r11
+movq %rdx, %r12
+movq %r9, %rax
+mulq %r9
+addq %r11, %rax
+adcq %r12, %rdx
+shrdq $44, %r14, %r13
+movq %r13, -40(%rsp)
+movq $0, -32(%rsp)
+addq -40(%rsp), %rax
+adcq -32(%rsp), %rdx
+movabsq $4398046511103, %rdi
+andq %rax, %rdi
+shrdq $42, %rdx, %rax
+leaq (%rax,%rax,4), %r8
+addq %rcx, %r8
+movabsq $17592186044415, %r10
+andq %r8, %r10
+shrq $44, %r8
+addq %rsi, %r8
+movabsq $17592186044415, %r9
+andq %r8, %r9
+shrq $44, %r8
+addq %rdi, %r8
+movl %r10d, %eax
+andl $67108863, %eax
+movl %eax, (%r15)
+movl %r9d, %edx
+sall $18, %edx
+movq %r10, %rax
+shrq $26, %rax
+orl %edx, %eax
+andl $67108863, %eax
+movl %eax, 4(%r15)
+movq %r9, %rax
+shrq $8, %rax
+andl $67108863, %eax
+movl %eax, 8(%r15)
+movl %r8d, %edx
+sall $10, %edx
+movq %r9, %rax
+shrq $34, %rax
+orl %edx, %eax
+andl $67108863, %eax
+movl %eax, 12(%r15)
+movq %r8, %rax
+shrq $16, %rax
+movl %eax, 16(%r15)
+addq $1, %rbx
+cmpq $2, %rbx
+jne .L7
+.L5:
+movq $0, 120(%rbp)
+popq %rbx
+popq %rbp
+popq %r12
+popq %r13
+popq %r14
+popq %r15
+ret
+FN_END poly1305_init_ext_avx
+
+
+
+GLOBAL_HIDDEN_FN poly1305_blocks_avx
+poly1305_blocks_avx_local:
+pushq %rbp
+movq %rsp, %rbp
+pushq %rbx
+andq $-64, %rsp
+subq $200, %rsp
+movl $(1 << 24), %eax
+movl $((1 << 26) - 1), %r8d
+movl $(5), %r9d
+vmovd %eax, %xmm1
+vmovd %r8d, %xmm0
+vmovd %r9d, %xmm2
+vpshufd $68, %xmm1, %xmm1
+vpshufd $68, %xmm0, %xmm0
+vpshufd $68, %xmm2, %xmm2
+vmovdqa %xmm1, 152(%rsp)
+vmovdqa %xmm2, 184(%rsp)
+movq 120(%rdi), %rax
+testb $4, %al
+je .L12
+vpsrldq $8, %xmm1, %xmm1
+vmovdqa %xmm1, 152(%rsp)
+.L12:
+testb $8, %al
+je .L13
+vpxor %xmm1, %xmm1, %xmm1
+vmovdqa %xmm1, 152(%rsp)
+.L13:
+testb $1, %al
+jne .L14
+vmovq (%rsi), %xmm1
+vpinsrq $1, 16(%rsi), %xmm1, %xmm1
+vmovq 8(%rsi), %xmm3
+vpinsrq $1, 24(%rsi), %xmm3, %xmm2
+vpand %xmm0, %xmm1, %xmm7
+vpsrlq $26, %xmm1, %xmm12
+vpand %xmm0, %xmm12, %xmm12
+vpsllq $12, %xmm2, %xmm3
+vpsrlq $52, %xmm1, %xmm1
+vpor %xmm3, %xmm1, %xmm6
+vpand %xmm0, %xmm6, %xmm3
+vpsrlq $26, %xmm6, %xmm6
+vpand %xmm0, %xmm6, %xmm6
+vpsrlq $40, %xmm2, %xmm2
+vpor 152(%rsp), %xmm2, %xmm2
+addq $32, %rsi
+subq $32, %rdx
+orq $1, %rax
+movq %rax, 120(%rdi)
+jmp .L15
+.L14:
+vmovdqu (%rdi), %xmm12
+vmovdqu 16(%rdi), %xmm6
+vmovdqu 32(%rdi), %xmm2
+vpshufd $80, %xmm12, %xmm7
+vpshufd $250, %xmm12, %xmm12
+vpshufd $80, %xmm6, %xmm3
+vpshufd $250, %xmm6, %xmm6
+vpshufd $80, %xmm2, %xmm2
+.L15:
+movq 120(%rdi), %rax
+testb $48, %al
+je .L16
+testb $16, %al
+je .L17
+vmovdqu 40(%rdi), %xmm1
+vmovd 56(%rdi), %xmm4
+vmovdqu 60(%rdi), %xmm5
+vpunpckldq %xmm1, %xmm5, %xmm11
+vpunpckhdq %xmm1, %xmm5, %xmm5
+vmovd 76(%rdi), %xmm1
+vpunpcklqdq %xmm4, %xmm1, %xmm4
+jmp .L18
+.L17:
+movl $(1), %r8d
+vmovdqu 40(%rdi), %xmm5
+vmovd 56(%rdi), %xmm4
+vmovd %r8d, %xmm1
+vpunpckldq %xmm1, %xmm5, %xmm11
+vpunpckhdq %xmm1, %xmm5, %xmm5
+.L18:
+vpshufd $80, %xmm11, %xmm1
+vpshufd $250, %xmm11, %xmm11
+vpshufd $80, %xmm5, %xmm10
+vpshufd $250, %xmm5, %xmm5
+jmp .L19
+.L16:
+vmovdqu 60(%rdi), %xmm5
+vpshufd $0, %xmm5, %xmm1
+vpshufd $85, %xmm5, %xmm11
+vpshufd $170, %xmm5, %xmm10
+vpshufd $255, %xmm5, %xmm5
+vmovd 76(%rdi), %xmm4
+vpshufd $0, %xmm4, %xmm4
+.L19:
+vmovdqa %xmm11, 136(%rsp)
+vpmuludq 184(%rsp), %xmm11, %xmm13
+vmovdqa %xmm13, 120(%rsp)
+vmovdqa %xmm10, 104(%rsp)
+vpmuludq 184(%rsp), %xmm10, %xmm13
+vmovdqa %xmm13, 88(%rsp)
+vmovdqa %xmm5, 72(%rsp)
+vpmuludq 184(%rsp), %xmm5, %xmm5
+vmovdqa %xmm5, 56(%rsp)
+vmovdqa %xmm4, 40(%rsp)
+vpmuludq 184(%rsp), %xmm4, %xmm4
+vmovdqa %xmm4, 24(%rsp)
+cmpq $63, %rdx
+jbe .L20
+vmovdqu 80(%rdi), %xmm4
+vpshufd $0, %xmm4, %xmm5
+vmovdqa %xmm5, 8(%rsp)
+vpshufd $85, %xmm4, %xmm5
+vmovdqa %xmm5, -8(%rsp)
+vpshufd $170, %xmm4, %xmm13
+vmovdqa %xmm13, -24(%rsp)
+vpshufd $255, %xmm4, %xmm4
+vmovdqa %xmm4, %xmm10
+vmovdqa %xmm4, -40(%rsp)
+vmovd 96(%rdi), %xmm4
+vpshufd $0, %xmm4, %xmm4
+vmovdqa %xmm4, %xmm8
+vmovdqa %xmm4, -56(%rsp)
+vpmuludq 184(%rsp), %xmm5, %xmm4
+vmovdqa %xmm4, -72(%rsp)
+vpmuludq 184(%rsp), %xmm13, %xmm4
+vmovdqa %xmm4, -88(%rsp)
+vpmuludq 184(%rsp), %xmm10, %xmm4
+vmovdqa %xmm4, -104(%rsp)
+vpmuludq 184(%rsp), %xmm8, %xmm4
+vmovdqa %xmm4, -120(%rsp)
+leaq 32(%rsi), %rax
+movq %rdx, %rcx
+vmovdqa %xmm1, 168(%rsp)
+jmp .L22
+.p2align 6
+nop
+nop
+nop
+nop
+.L22:
+vpmuludq -72(%rsp), %xmm2, %xmm13
+vmovdqa -88(%rsp), %xmm5
+vpmuludq %xmm5, %xmm6, %xmm4
+vpmuludq %xmm5, %xmm2, %xmm11
+vmovdqa -104(%rsp), %xmm9
+vpmuludq %xmm9, %xmm6, %xmm5
+vpmuludq %xmm9, %xmm2, %xmm10
+vpaddq %xmm4, %xmm13, %xmm13
+vpmuludq %xmm9, %xmm3, %xmm4
+vmovdqa -120(%rsp), %xmm8
+vpmuludq %xmm8, %xmm2, %xmm9
+vpaddq %xmm5, %xmm11, %xmm11
+vmovdqa %xmm8, %xmm5
+vpmuludq %xmm8, %xmm12, %xmm8
+vpmuludq %xmm5, %xmm3, %xmm14
+vpaddq %xmm4, %xmm13, %xmm13
+vpmuludq %xmm5, %xmm6, %xmm4
+vmovdqa 8(%rsp), %xmm15
+vpmuludq %xmm15, %xmm6, %xmm5
+vpaddq %xmm8, %xmm13, %xmm13
+vpmuludq %xmm15, %xmm2, %xmm8
+vpaddq %xmm14, %xmm11, %xmm11
+vpmuludq %xmm15, %xmm7, %xmm14
+vpaddq %xmm4, %xmm10, %xmm10
+vpmuludq %xmm15, %xmm12, %xmm4
+vpaddq %xmm5, %xmm9, %xmm9
+vpmuludq %xmm15, %xmm3, %xmm5
+vmovdqa -8(%rsp), %xmm15
+vpmuludq %xmm15, %xmm3, %xmm2
+vpaddq %xmm14, %xmm13, %xmm13
+vpmuludq %xmm15, %xmm6, %xmm6
+vpaddq %xmm4, %xmm11, %xmm11
+vpmuludq %xmm15, %xmm7, %xmm4
+vpaddq %xmm5, %xmm10, %xmm10
+vmovq -32(%rax), %xmm5
+vpinsrq $1, -16(%rax), %xmm5, %xmm5
+vpmuludq %xmm15, %xmm12, %xmm14
+vpaddq %xmm2, %xmm9, %xmm9
+vmovdqa -24(%rsp), %xmm2
+vpmuludq %xmm2, %xmm12, %xmm15
+vpaddq %xmm6, %xmm8, %xmm8
+vpmuludq %xmm2, %xmm3, %xmm3
+vpaddq %xmm4, %xmm11, %xmm11
+vmovq -24(%rax), %xmm4
+vpinsrq $1, -8(%rax), %xmm4, %xmm6
+vpmuludq %xmm2, %xmm7, %xmm4
+vpaddq %xmm14, %xmm10, %xmm10
+vmovdqa -40(%rsp), %xmm1
+vpmuludq %xmm1, %xmm7, %xmm14
+vpaddq %xmm15, %xmm9, %xmm9
+vpand %xmm5, %xmm0, %xmm2
+vpmuludq %xmm1, %xmm12, %xmm12
+vpaddq %xmm3, %xmm8, %xmm8
+vpsrlq $26, %xmm5, %xmm3
+vpand %xmm3, %xmm0, %xmm3
+vpmuludq -56(%rsp), %xmm7, %xmm7
+vpaddq %xmm4, %xmm10, %xmm10
+vpsllq $12, %xmm6, %xmm15
+vpsrlq $52, %xmm5, %xmm4
+vpor %xmm15, %xmm4, %xmm4
+vpaddq %xmm14, %xmm9, %xmm9
+vpsrlq $14, %xmm6, %xmm5
+vpand %xmm5, %xmm0, %xmm5
+vpaddq %xmm12, %xmm8, %xmm8
+vpand %xmm4, %xmm0, %xmm4
+vpaddq %xmm7, %xmm8, %xmm8
+vpsrlq $40, %xmm6, %xmm6
+vpor 152(%rsp), %xmm6, %xmm6
+vmovdqu (%rax), %xmm12
+vmovdqu 16(%rax), %xmm7
+vpunpckldq %xmm7, %xmm12, %xmm15
+vpunpckhdq %xmm7, %xmm12, %xmm7
+vpxor %xmm14, %xmm14, %xmm14
+vpunpckldq %xmm14, %xmm15, %xmm12
+vpunpckhdq %xmm14, %xmm15, %xmm15
+vpunpckldq %xmm14, %xmm7, %xmm14
+vpxor %xmm1, %xmm1, %xmm1
+vpunpckhdq %xmm1, %xmm7, %xmm7
+vpsllq $6, %xmm15, %xmm15
+vpsllq $12, %xmm14, %xmm14
+vpsllq $18, %xmm7, %xmm7
+vpaddq %xmm12, %xmm13, %xmm12
+vpaddq %xmm15, %xmm11, %xmm15
+vpaddq %xmm14, %xmm10, %xmm14
+vpaddq %xmm7, %xmm9, %xmm7
+vpaddq 152(%rsp), %xmm8, %xmm8
+vpmuludq 120(%rsp), %xmm6, %xmm13
+vmovdqa 88(%rsp), %xmm10
+vpmuludq %xmm10, %xmm5, %xmm9
+vpmuludq %xmm10, %xmm6, %xmm11
+vmovdqa 56(%rsp), %xmm1
+vpmuludq %xmm1, %xmm5, %xmm10
+vpaddq %xmm13, %xmm12, %xmm12
+vpmuludq %xmm1, %xmm6, %xmm13
+vpaddq %xmm9, %xmm12, %xmm12
+vpmuludq %xmm1, %xmm4, %xmm9
+vpaddq %xmm11, %xmm15, %xmm15
+vmovdqa 24(%rsp), %xmm1
+vpmuludq %xmm1, %xmm6, %xmm11
+vpaddq %xmm10, %xmm15, %xmm10
+vpmuludq %xmm1, %xmm3, %xmm15
+vpaddq %xmm13, %xmm14, %xmm14
+vpmuludq %xmm1, %xmm4, %xmm13
+vpaddq %xmm9, %xmm12, %xmm9
+vpmuludq %xmm1, %xmm5, %xmm12
+vpaddq %xmm11, %xmm7, %xmm7
+vpmuludq 168(%rsp), %xmm5, %xmm11
+vpaddq %xmm15, %xmm9, %xmm9
+vpmuludq 168(%rsp), %xmm6, %xmm6
+vpaddq %xmm13, %xmm10, %xmm10
+vpmuludq 168(%rsp), %xmm2, %xmm15
+vpaddq %xmm12, %xmm14, %xmm14
+vpmuludq 168(%rsp), %xmm3, %xmm13
+vpaddq %xmm11, %xmm7, %xmm11
+vpmuludq 168(%rsp), %xmm4, %xmm12
+vpaddq %xmm6, %xmm8, %xmm6
+vmovdqa 136(%rsp), %xmm8
+vpmuludq %xmm8, %xmm4, %xmm7
+vpaddq %xmm15, %xmm9, %xmm9
+vpmuludq %xmm8, %xmm5, %xmm5
+vpaddq %xmm13, %xmm10, %xmm10
+vpmuludq %xmm8, %xmm2, %xmm15
+vpaddq %xmm12, %xmm14, %xmm14
+vpmuludq %xmm8, %xmm3, %xmm8
+vpaddq %xmm7, %xmm11, %xmm11
+vmovdqa 104(%rsp), %xmm7
+vpmuludq %xmm7, %xmm3, %xmm13
+vpaddq %xmm5, %xmm6, %xmm6
+vpmuludq %xmm7, %xmm4, %xmm4
+vpaddq %xmm15, %xmm10, %xmm10
+vpmuludq %xmm7, %xmm2, %xmm15
+vpaddq %xmm8, %xmm14, %xmm14
+vmovdqa 72(%rsp), %xmm5
+vpmuludq %xmm5, %xmm2, %xmm7
+vpaddq %xmm13, %xmm11, %xmm11
+vpmuludq %xmm5, %xmm3, %xmm3
+vpaddq %xmm4, %xmm6, %xmm6
+vpmuludq 40(%rsp), %xmm2, %xmm2
+vpaddq %xmm15, %xmm14, %xmm14
+vpaddq %xmm7, %xmm11, %xmm11
+vpaddq %xmm3, %xmm6, %xmm6
+vpaddq %xmm2, %xmm6, %xmm2
+vpsrlq $26, %xmm9, %xmm12
+vpsrlq $26, %xmm11, %xmm5
+vpand %xmm0, %xmm9, %xmm9
+vpand %xmm0, %xmm11, %xmm11
+vpaddq %xmm12, %xmm10, %xmm10
+vpaddq %xmm5, %xmm2, %xmm2
+vpsrlq $26, %xmm10, %xmm3
+vpsrlq $26, %xmm2, %xmm7
+vpand %xmm0, %xmm10, %xmm10
+vpand %xmm0, %xmm2, %xmm2
+vpaddq %xmm3, %xmm14, %xmm3
+vpmuludq 184(%rsp), %xmm7, %xmm7
+vpaddq %xmm7, %xmm9, %xmm9
+vpsrlq $26, %xmm3, %xmm6
+vpsrlq $26, %xmm9, %xmm12
+vpand %xmm0, %xmm3, %xmm3
+vpand %xmm0, %xmm9, %xmm7
+vpaddq %xmm6, %xmm11, %xmm6
+vpaddq %xmm12, %xmm10, %xmm12
+vpsrlq $26, %xmm6, %xmm8
+vpand %xmm0, %xmm6, %xmm6
+vpaddq %xmm8, %xmm2, %xmm2
+subq $64, %rcx
+addq $64, %rax
+cmpq $63, %rcx
+ja .L22
+vmovdqa 168(%rsp), %xmm1
+leaq -64(%rdx), %rax
+andq $-64, %rax
+leaq 64(%rsi,%rax), %rsi
+andl $63, %edx
+.L20:
+cmpq $31, %rdx
+jbe .L23
+vpmuludq 120(%rsp), %xmm2, %xmm11
+vmovdqa 88(%rsp), %xmm4
+vpmuludq %xmm4, %xmm6, %xmm0
+vpmuludq %xmm4, %xmm2, %xmm10
+vmovdqa 56(%rsp), %xmm4
+vpmuludq %xmm4, %xmm6, %xmm8
+vpmuludq %xmm4, %xmm2, %xmm5
+vpaddq %xmm0, %xmm11, %xmm11
+vpmuludq %xmm4, %xmm3, %xmm0
+vmovdqa 24(%rsp), %xmm13
+vpmuludq %xmm13, %xmm2, %xmm4
+vpaddq %xmm8, %xmm10, %xmm10
+vpmuludq %xmm13, %xmm12, %xmm8
+vpmuludq %xmm13, %xmm3, %xmm9
+vpaddq %xmm0, %xmm11, %xmm11
+vpmuludq %xmm13, %xmm6, %xmm13
+vpmuludq %xmm1, %xmm6, %xmm0
+vpaddq %xmm8, %xmm11, %xmm8
+vpmuludq %xmm1, %xmm2, %xmm2
+vpaddq %xmm9, %xmm10, %xmm9
+vpmuludq %xmm1, %xmm7, %xmm11
+vpaddq %xmm13, %xmm5, %xmm5
+vpmuludq %xmm1, %xmm12, %xmm10
+vpaddq %xmm0, %xmm4, %xmm0
+vpmuludq %xmm1, %xmm3, %xmm1
+vmovdqa 136(%rsp), %xmm4
+vpmuludq %xmm4, %xmm3, %xmm14
+vpaddq %xmm11, %xmm8, %xmm11
+vpmuludq %xmm4, %xmm6, %xmm6
+vpaddq %xmm10, %xmm9, %xmm9
+vpmuludq %xmm4, %xmm7, %xmm15
+vpaddq %xmm1, %xmm5, %xmm5
+vpmuludq %xmm4, %xmm12, %xmm1
+vpaddq %xmm14, %xmm0, %xmm0
+vmovdqa 104(%rsp), %xmm4
+vpmuludq %xmm4, %xmm12, %xmm8
+vpaddq %xmm6, %xmm2, %xmm2
+vpmuludq %xmm4, %xmm3, %xmm3
+vpaddq %xmm15, %xmm9, %xmm9
+vpmuludq %xmm4, %xmm7, %xmm10
+vpaddq %xmm1, %xmm5, %xmm1
+vmovdqa 72(%rsp), %xmm4
+vpmuludq %xmm4, %xmm7, %xmm15
+vpaddq %xmm8, %xmm0, %xmm0
+vpmuludq %xmm4, %xmm12, %xmm12
+vpaddq %xmm3, %xmm2, %xmm2
+vpmuludq 40(%rsp), %xmm7, %xmm7
+vpaddq %xmm10, %xmm1, %xmm1
+vpaddq %xmm15, %xmm0, %xmm0
+vpaddq %xmm12, %xmm2, %xmm2
+vpaddq %xmm7, %xmm2, %xmm2
+movl $((1 << 26) - 1), %r8d
+testq %rsi, %rsi
+vmovd %r8d, %xmm15
+je .L24
+vmovdqu (%rsi), %xmm4
+vmovdqu 16(%rsi), %xmm3
+vpunpckldq %xmm3, %xmm4, %xmm5
+vpunpckhdq %xmm3, %xmm4, %xmm3
+vpxor %xmm4, %xmm4, %xmm4
+vpunpckldq %xmm4, %xmm5, %xmm7
+vpunpckhdq %xmm4, %xmm5, %xmm5
+vpunpckldq %xmm4, %xmm3, %xmm6
+vpunpckhdq %xmm4, %xmm3, %xmm3
+vpsllq $6, %xmm5, %xmm5
+vpsllq $12, %xmm6, %xmm6
+vpsllq $18, %xmm3, %xmm3
+vpaddq %xmm7, %xmm11, %xmm11
+vpaddq %xmm5, %xmm9, %xmm9
+vpaddq %xmm6, %xmm1, %xmm1
+vpaddq %xmm3, %xmm0, %xmm0
+vpaddq 152(%rsp), %xmm2, %xmm2
+.L24:
+vpshufd $68, %xmm15, %xmm15
+vpsrlq $26, %xmm11, %xmm12
+vpsrlq $26, %xmm0, %xmm3
+vpand %xmm15, %xmm11, %xmm11
+vpand %xmm15, %xmm0, %xmm6
+vpaddq %xmm12, %xmm9, %xmm9
+vpaddq %xmm3, %xmm2, %xmm2
+vpsrlq $26, %xmm9, %xmm3
+vpsrlq $26, %xmm2, %xmm7
+vpand %xmm15, %xmm9, %xmm9
+vpand %xmm15, %xmm2, %xmm2
+vpaddq %xmm3, %xmm1, %xmm3
+vpmuludq 184(%rsp), %xmm7, %xmm7
+vpaddq %xmm7, %xmm11, %xmm7
+vpsrlq $26, %xmm3, %xmm4
+vpsrlq $26, %xmm7, %xmm1
+vpand %xmm15, %xmm3, %xmm3
+vpand %xmm15, %xmm7, %xmm7
+vpaddq %xmm4, %xmm6, %xmm6
+vpaddq %xmm1, %xmm9, %xmm12
+vpsrlq $26, %xmm6, %xmm0
+vpand %xmm15, %xmm6, %xmm6
+vpaddq %xmm0, %xmm2, %xmm2
+.L23:
+testq %rsi, %rsi
+je .L25
+vpshufd $8, %xmm7, %xmm7
+vpshufd $8, %xmm12, %xmm12
+vpshufd $8, %xmm3, %xmm3
+vpshufd $8, %xmm6, %xmm6
+vpshufd $8, %xmm2, %xmm2
+vpunpcklqdq %xmm12, %xmm7, %xmm7
+vpunpcklqdq %xmm6, %xmm3, %xmm3
+vmovdqu %xmm7, (%rdi)
+vmovdqu %xmm3, 16(%rdi)
+vmovq %xmm2, 32(%rdi)
+jmp .L11
+.L25:
+vpsrldq $8, %xmm7, %xmm0
+vpaddq %xmm0, %xmm7, %xmm7
+vpsrldq $8, %xmm12, %xmm0
+vpaddq %xmm0, %xmm12, %xmm12
+vpsrldq $8, %xmm3, %xmm0
+vpaddq %xmm0, %xmm3, %xmm3
+vpsrldq $8, %xmm6, %xmm0
+vpaddq %xmm0, %xmm6, %xmm6
+vpsrldq $8, %xmm2, %xmm0
+vpaddq %xmm0, %xmm2, %xmm2
+vmovd %xmm7, %eax
+vmovd %xmm12, %edx
+movl %eax, %r9d
+shrl $26, %r9d
+addl %edx, %r9d
+movl %r9d, %r8d
+andl $67108863, %r8d
+vmovd %xmm3, %edx
+shrl $26, %r9d
+addl %edx, %r9d
+vmovd %xmm6, %edx
+movl %r9d, %ecx
+shrl $26, %ecx
+addl %edx, %ecx
+movl %ecx, %esi
+andl $67108863, %esi
+vmovd %xmm2, %r10d
+movl %r8d, %r11d
+salq $26, %r11
+andl $67108863, %eax
+orq %rax, %r11
+movabsq $17592186044415, %rax
+andq %rax, %r11
+andl $67108863, %r9d
+salq $8, %r9
+shrl $18, %r8d
+movl %r8d, %r8d
+orq %r8, %r9
+movq %rsi, %rdx
+salq $34, %rdx
+orq %rdx, %r9
+andq %rax, %r9
+shrl $26, %ecx
+addl %r10d, %ecx
+salq $16, %rcx
+shrl $10, %esi
+movl %esi, %esi
+orq %rsi, %rcx
+movabsq $4398046511103, %r10
+movq %rcx, %r8
+andq %r10, %r8
+shrq $42, %rcx
+leaq (%rcx,%rcx,4), %rdx
+addq %r11, %rdx
+movq %rdx, %rsi
+andq %rax, %rsi
+shrq $44, %rdx
+addq %r9, %rdx
+movq %rdx, %rcx
+andq %rax, %rcx
+shrq $44, %rdx
+addq %r8, %rdx
+andq %rdx, %r10
+shrq $42, %rdx
+leaq (%rsi,%rdx,4), %rsi
+leaq (%rsi,%rdx), %r11
+movq %r11, %rbx
+andq %rax, %rbx
+shrq $44, %r11
+addq %rcx, %r11
+leaq 5(%rbx), %r9
+movq %r9, %r8
+shrq $44, %r8
+addq %r11, %r8
+movabsq $-4398046511104, %rsi
+addq %r10, %rsi
+movq %r8, %rdx
+shrq $44, %rdx
+addq %rdx, %rsi
+movq %rsi, %rdx
+shrq $63, %rdx
+subq $1, %rdx
+movq %rdx, %rcx
+notq %rcx
+andq %rcx, %rbx
+andq %rcx, %r11
+andq %r10, %rcx
+andq %rax, %r9
+andq %rdx, %r9
+orq %r9, %rbx
+movq %rbx, (%rdi)
+andq %r8, %rax
+andq %rdx, %rax
+orq %rax, %r11
+movq %r11, 8(%rdi)
+andq %rsi, %rdx
+orq %rcx, %rdx
+movq %rdx, 16(%rdi)
+.L11:
+movq -8(%rbp), %rbx
+leave
+ret
+FN_END poly1305_blocks_avx
+
+GLOBAL_HIDDEN_FN poly1305_finish_ext_avx
+poly1305_finish_ext_avx_local:
+pushq %r12
+pushq %rbp
+pushq %rbx
+subq $32, %rsp
+movq %rdi, %rbx
+movq %rdx, %rbp
+movq %rcx, %r12
+testq %rdx, %rdx
+je .L30
+movq $0, (%rsp)
+movq $0, 8(%rsp)
+movq $0, 16(%rsp)
+movq $0, 24(%rsp)
+movq %rsp, %rax
+subq %rsp, %rsi
+testb $16, %dl
+je .L31
+vmovdqu (%rsp,%rsi), %xmm0
+vmovdqa %xmm0, (%rsp)
+addq $16, %rax
+.L31:
+testb $8, %bpl
+je .L32
+movq (%rax,%rsi), %rdx
+movq %rdx, (%rax)
+addq $8, %rax
+.L32:
+testb $4, %bpl
+je .L33
+movl (%rax,%rsi), %edx
+movl %edx, (%rax)
+addq $4, %rax
+.L33:
+testb $2, %bpl
+je .L34
+movzwl (%rax,%rsi), %edx
+movw %dx, (%rax)
+addq $2, %rax
+.L34:
+testb $1, %bpl
+je .L35
+movzbl (%rax,%rsi), %edx
+movb %dl, (%rax)
+.L35:
+cmpq $16, %rbp
+je .L36
+movb $1, (%rsp,%rbp)
+movq 120(%rbx), %rdx
+cmpq $16, %rbp
+sbbq %rax, %rax
+andl $4, %eax
+addq $4, %rax
+.L37:
+orq %rdx, %rax
+movq %rax, 120(%rbx)
+movq %rsp, %rsi
+movl $32, %edx
+movq %rbx, %rdi
+call poly1305_blocks_avx_local
+.L30:
+movq 120(%rbx), %rax
+testb $1, %al
+je .L38
+subq $1, %rbp
+cmpq $15, %rbp
+jbe .L39
+orq $16, %rax
+movq %rax, 120(%rbx)
+jmp .L40
+.L39:
+orq $32, %rax
+movq %rax, 120(%rbx)
+.L40:
+movl $32, %edx
+movl $0, %esi
+movq %rbx, %rdi
+call poly1305_blocks_avx_local
+.L38:
+movq 8(%rbx), %rax
+movq %rax, %rdx
+salq $44, %rdx
+orq (%rbx), %rdx
+shrq $20, %rax
+movq 16(%rbx), %rcx
+salq $24, %rcx
+orq %rcx, %rax
+movq 104(%rbx), %rcx
+movq 112(%rbx), %rsi
+addq %rcx, %rdx
+adcq %rsi, %rax
+vpxor %xmm0, %xmm0, %xmm0
+vmovdqu %xmm0, (%rbx)
+vmovdqu %xmm0, 16(%rbx)
+vmovdqu %xmm0, 32(%rbx)
+vmovdqu %xmm0, 48(%rbx)
+vmovdqu %xmm0, 64(%rbx)
+vmovdqu %xmm0, 80(%rbx)
+vmovdqu %xmm0, 96(%rbx)
+vmovdqu %xmm0, 112(%rbx)
+movq %rdx, (%r12)
+movq %rax, 8(%r12)
+jmp .L43
+.L36:
+movq 120(%rbx), %rdx
+movl $4, %eax
+jmp .L37
+.L43:
+addq $32, %rsp
+popq %rbx
+popq %rbp
+popq %r12
+ret
+FN_END poly1305_finish_ext_avx
+
+GLOBAL_HIDDEN_FN poly1305_auth_avx
+cmp $128, %rdx
+jb poly1305_auth_x86_local
+pushq %rbp
+movq %rsp, %rbp
+pushq %r14
+pushq %r13
+pushq %r12
+pushq %rbx
+andq $-64, %rsp
+addq $-128, %rsp
+movq %rdi, %r14
+movq %rsi, %r12
+movq %rdx, %rbx
+movq %rsp, %rdi
+movq %rcx, %rsi
+call poly1305_init_ext_avx_local
+movq %rbx, %r13
+andq $-32, %r13
+je .L46
+movq %rsp, %rdi
+movq %r13, %rdx
+movq %r12, %rsi
+call poly1305_blocks_avx_local
+addq %r13, %r12
+subq %r13, %rbx
+.L46:
+movq %rsp, %rdi
+movq %r14, %rcx
+movq %rbx, %rdx
+movq %r12, %rsi
+call poly1305_finish_ext_avx_local
+leaq -32(%rbp), %rsp
+popq %rbx
+popq %r12
+popq %r13
+popq %r14
+popq %rbp
+ret
+FN_END poly1305_auth_avx
--- /dev/null
+#include "../chacha20/macro.S"
+#include "constants.S"
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx2, 0, 0
+movl $64, %eax
+ret
+FN_END poly1305_block_size_avx2
+
+GLOBAL_HIDDEN_FN poly1305_auth_avx2
+cmp $128, %rdx
+jb poly1305_auth_x86_local
+pushq %rbp
+movq %rsp, %rbp
+andq $-64, %rsp
+pushq %r12
+pushq %r14
+pushq %r15
+pushq %rbx
+subq $224, %rsp
+movq %rsi, %r14
+movq %rdi, %rbx
+lea (%rsp), %rdi
+movq %rcx, %rsi
+movq %rdx, %r12
+call poly1305_init_ext_avx2_local
+poly1305_auth_avx2_2:
+movq %r12, %r15
+andq $-64, %r15
+je poly1305_auth_avx2_5
+poly1305_auth_avx2_3:
+movq %r14, %rsi
+lea (%rsp), %rdi
+movq %r15, %rdx
+call poly1305_blocks_avx2_local
+poly1305_auth_avx2_4:
+addq %r15, %r14
+subq %r15, %r12
+poly1305_auth_avx2_5:
+movq %r14, %rsi
+lea (%rsp), %rdi
+movq %r12, %rdx
+movq %rbx, %rcx
+call poly1305_finish_ext_avx2_local
+poly1305_auth_avx2_6:
+addq $224, %rsp
+popq %rbx
+popq %r15
+popq %r14
+popq %r12
+movq %rbp, %rsp
+popq %rbp
+ret
+FN_END poly1305_auth_avx2
+
+
+GLOBAL_HIDDEN_FN poly1305_finish_ext_avx2
+poly1305_finish_ext_avx2_local:
+pushq %rbp
+movq %rsp, %rbp
+andq $-64, %rsp
+pushq %r12
+pushq %r13
+pushq %r14
+subq $104, %rsp
+movq %rdx, %r13
+movq %rcx, %r14
+movq %rdi, %r12
+testq %r13, %r13
+je poly1305_finish_ext_avx2_29
+poly1305_finish_ext_avx2_2:
+lea (%rsp), %rax
+vpxor %ymm0, %ymm0, %ymm0
+subq %rax, %rsi
+vmovdqu %ymm0, (%rsp)
+vmovdqu %ymm0, 32(%rsp)
+testq $32, %r13
+je poly1305_finish_ext_avx2_4
+poly1305_finish_ext_avx2_3:
+vmovdqu (%rsp,%rsi), %ymm0
+lea 32(%rsp), %rax
+vmovdqu %ymm0, (%rsp)
+poly1305_finish_ext_avx2_4:
+testq $16, %r13
+je poly1305_finish_ext_avx2_6
+poly1305_finish_ext_avx2_5:
+vmovdqu (%rax,%rsi), %xmm0
+vmovdqu %xmm0, (%rax)
+addq $16, %rax
+poly1305_finish_ext_avx2_6:
+testq $8, %r13
+je poly1305_finish_ext_avx2_8
+poly1305_finish_ext_avx2_7:
+movq (%rax,%rsi), %rdx
+movq %rdx, (%rax)
+addq $8, %rax
+poly1305_finish_ext_avx2_8:
+testq $4, %r13
+je poly1305_finish_ext_avx2_10
+poly1305_finish_ext_avx2_9:
+movl (%rax,%rsi), %edx
+movl %edx, (%rax)
+addq $4, %rax
+poly1305_finish_ext_avx2_10:
+testq $2, %r13
+je poly1305_finish_ext_avx2_12
+poly1305_finish_ext_avx2_11:
+movzwl (%rax,%rsi), %edx
+movw %dx, (%rax)
+addq $2, %rax
+poly1305_finish_ext_avx2_12:
+testq $1, %r13
+je poly1305_finish_ext_avx2_14
+poly1305_finish_ext_avx2_13:
+movb (%rax,%rsi), %dl
+movb %dl, (%rax)
+poly1305_finish_ext_avx2_14:
+testq $15, %r13
+je poly1305_finish_ext_avx2_16
+poly1305_finish_ext_avx2_15:
+movb $1, (%rsp,%r13)
+poly1305_finish_ext_avx2_16:
+movq 176(%r12), %rdx
+andq $-8125, %rdx
+cmpq $48, %r13
+jb poly1305_finish_ext_avx2_18
+poly1305_finish_ext_avx2_17:
+orq $4, %rdx
+jmp poly1305_finish_ext_avx2_21
+poly1305_finish_ext_avx2_18:
+cmpq $32, %r13
+jb poly1305_finish_ext_avx2_20
+poly1305_finish_ext_avx2_19:
+orq $8, %rdx
+jmp poly1305_finish_ext_avx2_21
+poly1305_finish_ext_avx2_20:
+movq %rdx, %rax
+orq $32, %rdx
+orq $16, %rax
+cmpq $16, %r13
+cmovae %rax, %rdx
+poly1305_finish_ext_avx2_21:
+testq $1, %rdx
+je poly1305_finish_ext_avx2_27
+poly1305_finish_ext_avx2_22:
+cmpq $16, %r13
+ja poly1305_finish_ext_avx2_24
+poly1305_finish_ext_avx2_23:
+orq $256, %rdx
+movq %rdx, 176(%r12)
+jmp poly1305_finish_ext_avx2_28
+poly1305_finish_ext_avx2_24:
+cmpq $32, %r13
+ja poly1305_finish_ext_avx2_27
+poly1305_finish_ext_avx2_25:
+orq $128, %rdx
+movq %rdx, 176(%r12)
+jmp poly1305_finish_ext_avx2_28
+poly1305_finish_ext_avx2_27:
+movq %rdx, 176(%r12)
+poly1305_finish_ext_avx2_28:
+movq %r12, %rdi
+lea (%rsp), %rsi
+movl $64, %edx
+vzeroupper
+call poly1305_blocks_avx2_local
+poly1305_finish_ext_avx2_29:
+movq 176(%r12), %rdx
+testq $1, %rdx
+je poly1305_finish_ext_avx2_37
+poly1305_finish_ext_avx2_30:
+andq $-8125, %rdx
+testq %r13, %r13
+je poly1305_finish_ext_avx2_32
+poly1305_finish_ext_avx2_31:
+cmpq $48, %r13
+jbe poly1305_finish_ext_avx2_33
+poly1305_finish_ext_avx2_32:
+orq $512, %rdx
+jmp poly1305_finish_ext_avx2_36
+poly1305_finish_ext_avx2_33:
+cmpq $32, %r13
+jbe poly1305_finish_ext_avx2_35
+poly1305_finish_ext_avx2_34:
+orq $1024, %rdx
+jmp poly1305_finish_ext_avx2_36
+poly1305_finish_ext_avx2_35:
+movq %rdx, %rax
+orq $4096, %rdx
+orq $2048, %rax
+cmpq $16, %r13
+cmova %rax, %rdx
+poly1305_finish_ext_avx2_36:
+orq $96, %rdx
+movq %r12, %rdi
+vpxor %ymm0, %ymm0, %ymm0
+lea (%rsp), %rsi
+movq %rdx, 176(%r12)
+movl $64, %edx
+vmovdqu %ymm0, (%rsp)
+vmovdqu %ymm0, 32(%rsp)
+vzeroupper
+call poly1305_blocks_avx2_local
+poly1305_finish_ext_avx2_37:
+movq 8(%r12), %r8
+movq %r8, %rsi
+movq 16(%r12), %rax
+vpxor %ymm0, %ymm0, %ymm0
+shlq $44, %rsi
+shrq $20, %r8
+shlq $24, %rax
+orq (%r12), %rsi
+orq %rax, %r8
+movq 160(%r12), %rdx
+movq 168(%r12), %rcx
+addq %rdx, %rsi
+adcq %rcx, %r8
+vmovdqu %ymm0, (%r12)
+vmovdqu %ymm0, 32(%r12)
+vmovdqu %ymm0, 64(%r12)
+vmovdqu %ymm0, 96(%r12)
+vmovdqu %ymm0, 128(%r12)
+vmovdqu %ymm0, 160(%r12)
+movq %rsi, (%r14)
+movq %r8, 8(%r14)
+vzeroupper
+addq $104, %rsp
+popq %r14
+popq %r13
+popq %r12
+movq %rbp, %rsp
+popq %rbp
+ret
+FN_END poly1305_finish_ext_avx2
+
+GLOBAL_HIDDEN_FN poly1305_blocks_avx2
+poly1305_blocks_avx2_local:
+pushq %rbp
+movq %rsp, %rbp
+andq $-64, %rsp
+subq $384, %rsp
+movl $16777216, %eax
+movl $67108863, %ecx
+movl $5, %r8d
+vmovd %eax, %xmm1
+vmovd %ecx, %xmm10
+vmovd %r8d, %xmm0
+movq 176(%rdi), %rax
+vpbroadcastq %xmm1, %ymm1
+vpbroadcastq %xmm10, %ymm10
+vpbroadcastq %xmm0, %ymm11
+testq $60, %rax
+je poly1305_blocks_avx2_11
+poly1305_blocks_avx2_2:
+vpsrldq $8, %ymm1, %ymm15
+testq $4, %rax
+je poly1305_blocks_avx2_4
+poly1305_blocks_avx2_3:
+vpermq $192, %ymm15, %ymm15
+poly1305_blocks_avx2_4:
+testq $8, %rax
+je poly1305_blocks_avx2_6
+poly1305_blocks_avx2_5:
+vpermq $240, %ymm15, %ymm15
+poly1305_blocks_avx2_6:
+testq $16, %rax
+je poly1305_blocks_avx2_8
+poly1305_blocks_avx2_7:
+vpermq $252, %ymm15, %ymm15
+poly1305_blocks_avx2_8:
+testq $32, %rax
+je poly1305_blocks_avx2_10
+poly1305_blocks_avx2_9:
+vpxor %ymm15, %ymm15, %ymm15
+poly1305_blocks_avx2_10:
+vmovdqa %ymm15, %ymm1
+poly1305_blocks_avx2_11:
+movq %rax, %rcx
+btsq $0, %rcx
+jc poly1305_blocks_avx2_13
+poly1305_blocks_avx2_12:
+vmovdqu (%rsi), %ymm3
+movq %rcx, %rax
+vmovdqu 32(%rsi), %ymm5
+vpunpcklqdq %ymm5, %ymm3, %ymm4
+addq $64, %rsi
+vpunpckhqdq %ymm5, %ymm3, %ymm7
+vpermq $216, %ymm4, %ymm6
+addq $-64, %rdx
+vpermq $216, %ymm7, %ymm0
+vpsrlq $52, %ymm6, %ymm8
+vpsllq $12, %ymm0, %ymm9
+vpsrlq $26, %ymm6, %ymm2
+vpsrlq $40, %ymm0, %ymm0
+vpand %ymm6, %ymm10, %ymm4
+vpor %ymm9, %ymm8, %ymm7
+vpand %ymm2, %ymm10, %ymm3
+vpor %ymm1, %ymm0, %ymm9
+vpsrlq $26, %ymm7, %ymm2
+vpand %ymm7, %ymm10, %ymm5
+vpand %ymm2, %ymm10, %ymm7
+movq %rax, 176(%rdi)
+jmp poly1305_blocks_avx2_14
+poly1305_blocks_avx2_13:
+vpermq $216, (%rdi), %ymm15
+vpxor %ymm0, %ymm0, %ymm0
+vpermq $216, 32(%rdi), %ymm14
+vpermq $216, 64(%rdi), %ymm13
+vpunpckldq %ymm0, %ymm15, %ymm4
+vpunpckhdq %ymm0, %ymm15, %ymm3
+vpunpckldq %ymm0, %ymm14, %ymm5
+vpunpckhdq %ymm0, %ymm14, %ymm7
+vpunpckldq %ymm0, %ymm13, %ymm9
+poly1305_blocks_avx2_14:
+cmpq $64, %rdx
+jb poly1305_blocks_avx2_34
+poly1305_blocks_avx2_15:
+vmovdqu 140(%rdi), %ymm0
+testq $8064, %rax
+je poly1305_blocks_avx2_29
+poly1305_blocks_avx2_16:
+vpermq $216, 80(%rdi), %ymm6
+vpermq $216, 100(%rdi), %ymm2
+vpermq $216, 120(%rdi), %ymm8
+vpermq $216, %ymm0, %ymm0
+testq $128, %rax
+je poly1305_blocks_avx2_18
+poly1305_blocks_avx2_17:
+vmovdqa %ymm0, %ymm15
+vmovdqa %ymm0, %ymm14
+vmovdqa %ymm0, %ymm13
+vmovdqa %ymm8, %ymm12
+jmp poly1305_blocks_avx2_28
+poly1305_blocks_avx2_18:
+testq $256, %rax
+je poly1305_blocks_avx2_20
+poly1305_blocks_avx2_19:
+vmovdqa %ymm0, %ymm15
+vmovdqa %ymm0, %ymm14
+vmovdqa %ymm8, %ymm13
+vmovdqa %ymm2, %ymm12
+jmp poly1305_blocks_avx2_28
+poly1305_blocks_avx2_20:
+testq $512, %rax
+je poly1305_blocks_avx2_22
+poly1305_blocks_avx2_21:
+vmovdqa %ymm0, %ymm15
+vmovdqa %ymm8, %ymm14
+vmovdqa %ymm2, %ymm13
+vmovdqa %ymm6, %ymm12
+jmp poly1305_blocks_avx2_28
+poly1305_blocks_avx2_22:
+testq $1024, %rax
+je poly1305_blocks_avx2_24
+poly1305_blocks_avx2_23:
+vpxor %ymm12, %ymm12, %ymm12
+movl $1, %r8d
+vmovdqa %ymm8, %ymm15
+vmovdqa %ymm2, %ymm14
+vmovdqa %ymm6, %ymm13
+vmovd %r8d, %xmm12
+jmp poly1305_blocks_avx2_28
+poly1305_blocks_avx2_24:
+testq $2048, %rax
+je poly1305_blocks_avx2_26
+poly1305_blocks_avx2_25:
+vpxor %ymm12, %ymm12, %ymm12
+movl $1, %r8d
+vmovd %r8d, %xmm13
+vmovdqa %ymm2, %ymm15
+vmovdqa %ymm6, %ymm14
+vmovdqa %ymm13, %ymm12
+jmp poly1305_blocks_avx2_28
+poly1305_blocks_avx2_26:
+testq $4096, %rax
+je poly1305_blocks_avx2_28
+poly1305_blocks_avx2_27:
+movl $1, %r8d
+vmovd %r8d, %xmm14
+vmovdqa %ymm6, %ymm15
+vmovdqa %ymm14, %ymm13
+vmovdqa %ymm14, %ymm12
+poly1305_blocks_avx2_28:
+vpunpcklqdq %ymm14, %ymm15, %ymm6
+vpunpcklqdq %ymm12, %ymm13, %ymm8
+vpunpckhqdq %ymm14, %ymm15, %ymm14
+vpunpckhqdq %ymm12, %ymm13, %ymm12
+vperm2i128 $32, %ymm8, %ymm6, %ymm2
+vperm2i128 $49, %ymm8, %ymm6, %ymm6
+vpsrlq $32, %ymm6, %ymm0
+vpsrlq $32, %ymm2, %ymm8
+vmovdqu %ymm0, 352(%rsp)
+vperm2i128 $32, %ymm12, %ymm14, %ymm13
+vmovdqu %ymm13, 320(%rsp)
+jmp poly1305_blocks_avx2_30
+poly1305_blocks_avx2_29:
+vpsrlq $32, %ymm0, %ymm12
+vpermq $0, %ymm0, %ymm2
+vpermq $85, %ymm0, %ymm6
+vpermq $85, %ymm12, %ymm13
+vpermq $170, %ymm0, %ymm0
+vpermq $0, %ymm12, %ymm8
+vmovdqu %ymm13, 352(%rsp)
+vmovdqu %ymm0, 320(%rsp)
+poly1305_blocks_avx2_30:
+vmovdqu (%rsi), %ymm12
+movq %rdx, %r9
+vmovdqu 352(%rsp), %ymm15
+vmovdqu %ymm1, 160(%rsp)
+vmovdqu %ymm10, 192(%rsp)
+vmovdqu %ymm11, 128(%rsp)
+vperm2i128 $32, 32(%rsi), %ymm12, %ymm13
+xorl %r8d, %r8d
+vperm2i128 $49, 32(%rsi), %ymm12, %ymm12
+xorl %ecx, %ecx
+vpmuludq %ymm11, %ymm8, %ymm0
+vpmuludq %ymm11, %ymm6, %ymm1
+vmovdqu %ymm0, 224(%rsp)
+vmovdqu %ymm1, 256(%rsp)
+vpunpckldq %ymm12, %ymm13, %ymm14
+vpunpckhdq %ymm12, %ymm13, %ymm12
+vmovdqu %ymm14, 32(%rsp)
+vpmuludq %ymm0, %ymm9, %ymm0
+vpmuludq %ymm1, %ymm7, %ymm13
+vpaddq %ymm13, %ymm0, %ymm0
+vpmuludq %ymm11, %ymm15, %ymm10
+vpmuludq %ymm10, %ymm5, %ymm13
+vpaddq %ymm13, %ymm0, %ymm0
+vmovdqu %ymm10, 288(%rsp)
+vpmuludq 320(%rsp), %ymm11, %ymm11
+vpmuludq %ymm11, %ymm3, %ymm13
+vpaddq %ymm13, %ymm0, %ymm0
+vmovdqu %ymm11, (%rsp)
+vpmuludq %ymm2, %ymm4, %ymm13
+vpaddq %ymm13, %ymm0, %ymm0
+vpxor %ymm13, %ymm13, %ymm13
+vpunpckldq %ymm13, %ymm14, %ymm14
+vpaddq %ymm14, %ymm0, %ymm0
+vmovdqu %ymm0, 64(%rsp)
+vpmuludq %ymm11, %ymm9, %ymm14
+vpmuludq %ymm2, %ymm7, %ymm0
+vpaddq %ymm0, %ymm14, %ymm14
+vpmuludq %ymm8, %ymm5, %ymm0
+vpaddq %ymm0, %ymm14, %ymm14
+vpmuludq %ymm6, %ymm3, %ymm0
+vpaddq %ymm0, %ymm14, %ymm14
+vpmuludq %ymm15, %ymm4, %ymm0
+vpaddq %ymm0, %ymm14, %ymm0
+vpunpckhdq %ymm13, %ymm12, %ymm14
+vpsllq $18, %ymm14, %ymm14
+vpaddq %ymm14, %ymm0, %ymm14
+vpmuludq %ymm1, %ymm9, %ymm1
+vpmuludq %ymm10, %ymm7, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vpmuludq %ymm11, %ymm5, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vpmuludq %ymm2, %ymm3, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vpmuludq %ymm8, %ymm4, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vmovdqu 32(%rsp), %ymm0
+vpunpckhdq %ymm13, %ymm0, %ymm0
+vpsllq $6, %ymm0, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vmovdqu 64(%rsp), %ymm0
+vpsrlq $26, %ymm0, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vmovdqu %ymm1, 96(%rsp)
+vpmuludq %ymm2, %ymm9, %ymm1
+vpmuludq %ymm8, %ymm7, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vpmuludq %ymm10, %ymm9, %ymm10
+vpmuludq %ymm11, %ymm7, %ymm11
+vpaddq %ymm11, %ymm10, %ymm7
+vpmuludq %ymm6, %ymm5, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vpmuludq %ymm2, %ymm5, %ymm5
+vpaddq %ymm5, %ymm7, %ymm10
+vpmuludq %ymm15, %ymm3, %ymm15
+vpaddq %ymm15, %ymm1, %ymm1
+vpmuludq %ymm8, %ymm3, %ymm11
+vpaddq %ymm11, %ymm10, %ymm5
+vpunpckldq %ymm13, %ymm12, %ymm10
+vmovdqu 96(%rsp), %ymm12
+vpmuludq 320(%rsp), %ymm4, %ymm0
+vpaddq %ymm0, %ymm1, %ymm15
+vpsrlq $26, %ymm12, %ymm3
+vmovdqu 160(%rsp), %ymm1
+vpmuludq %ymm6, %ymm4, %ymm4
+vpaddq %ymm1, %ymm15, %ymm0
+vpsrlq $26, %ymm14, %ymm15
+vpaddq %ymm4, %ymm5, %ymm11
+vpsllq $12, %ymm10, %ymm4
+vmovdqu 192(%rsp), %ymm10
+vpaddq %ymm15, %ymm0, %ymm0
+vpaddq %ymm4, %ymm11, %ymm5
+vmovdqu 128(%rsp), %ymm11
+vpsrlq $26, %ymm0, %ymm9
+vpaddq %ymm3, %ymm5, %ymm7
+vpand 64(%rsp), %ymm10, %ymm13
+vpand %ymm10, %ymm12, %ymm12
+vpand %ymm10, %ymm7, %ymm5
+vpsrlq $26, %ymm7, %ymm7
+vpmuludq %ymm11, %ymm9, %ymm15
+vpand %ymm10, %ymm14, %ymm9
+vpaddq %ymm15, %ymm13, %ymm3
+vpand %ymm10, %ymm0, %ymm14
+vpaddq %ymm7, %ymm9, %ymm9
+vpand %ymm10, %ymm3, %ymm4
+vpsrlq $26, %ymm3, %ymm3
+vpsrlq $26, %ymm9, %ymm0
+vpand %ymm10, %ymm9, %ymm7
+vpaddq %ymm3, %ymm12, %ymm3
+vpaddq %ymm0, %ymm14, %ymm9
+sarq $5, %r9
+shrq $58, %r9
+addq %rdx, %r9
+sarq $6, %r9
+cmpq $2, %r9
+jl poly1305_blocks_avx2_34
+poly1305_blocks_avx2_31:
+vmovdqu %ymm6, 32(%rsp)
+lea -64(%rdx), %r9
+vmovdqu %ymm8, 64(%rsp)
+vmovdqu %ymm11, 128(%rsp)
+vmovdqu %ymm10, 192(%rsp)
+vmovdqu %ymm1, 160(%rsp)
+vmovdqu (%rsp), %ymm12
+sarq $5, %r9
+shrq $58, %r9
+lea -64(%rdx,%r9), %rdx
+sarq $6, %rdx
+poly1305_blocks_avx2_32:
+vmovdqu 256(%rsp), %ymm15
+incq %r8
+vmovdqu 64(%rcx,%rsi), %ymm11
+vpmuludq 224(%rsp), %ymm9, %ymm8
+vpmuludq %ymm15, %ymm7, %ymm14
+vpaddq %ymm14, %ymm8, %ymm1
+vmovdqu 288(%rsp), %ymm8
+vperm2i128 $32, 96(%rcx,%rsi), %ymm11, %ymm10
+vperm2i128 $49, 96(%rcx,%rsi), %ymm11, %ymm6
+addq $64, %rcx
+vpmuludq %ymm8, %ymm5, %ymm13
+vpunpckldq %ymm6, %ymm10, %ymm0
+vpunpckhdq %ymm6, %ymm10, %ymm11
+vpaddq %ymm13, %ymm1, %ymm10
+vpmuludq %ymm12, %ymm3, %ymm6
+vpaddq %ymm6, %ymm10, %ymm14
+vpxor %ymm10, %ymm10, %ymm10
+vpunpckldq %ymm10, %ymm0, %ymm6
+vpunpckhdq %ymm10, %ymm0, %ymm0
+vpmuludq %ymm2, %ymm4, %ymm1
+vpaddq %ymm1, %ymm14, %ymm13
+vpaddq %ymm6, %ymm13, %ymm1
+vmovdqu 64(%rsp), %ymm6
+vmovdqu %ymm1, (%rsp)
+vpsrlq $26, %ymm1, %ymm1
+vpmuludq %ymm12, %ymm9, %ymm14
+vpmuludq %ymm2, %ymm7, %ymm13
+vpaddq %ymm13, %ymm14, %ymm14
+vpmuludq %ymm6, %ymm5, %ymm13
+vpaddq %ymm13, %ymm14, %ymm14
+vpmuludq 32(%rsp), %ymm3, %ymm13
+vpaddq %ymm13, %ymm14, %ymm14
+vpmuludq 352(%rsp), %ymm4, %ymm13
+vpaddq %ymm13, %ymm14, %ymm13
+vpunpckhdq %ymm10, %ymm11, %ymm14
+vpsllq $18, %ymm14, %ymm14
+vpaddq %ymm14, %ymm13, %ymm13
+vpmuludq %ymm15, %ymm9, %ymm15
+vpmuludq %ymm8, %ymm7, %ymm14
+vpaddq %ymm14, %ymm15, %ymm15
+vpmuludq %ymm12, %ymm5, %ymm14
+vpaddq %ymm14, %ymm15, %ymm15
+vpmuludq %ymm2, %ymm3, %ymm14
+vpaddq %ymm14, %ymm15, %ymm15
+vpmuludq %ymm6, %ymm4, %ymm14
+vpaddq %ymm14, %ymm15, %ymm14
+vpsllq $6, %ymm0, %ymm15
+vpaddq %ymm15, %ymm14, %ymm14
+vmovdqu 32(%rsp), %ymm15
+vpaddq %ymm1, %ymm14, %ymm1
+vpmuludq %ymm2, %ymm9, %ymm0
+vpmuludq %ymm6, %ymm7, %ymm14
+vpmuludq %ymm8, %ymm9, %ymm9
+vpmuludq %ymm12, %ymm7, %ymm7
+vpaddq %ymm7, %ymm9, %ymm7
+vpaddq %ymm14, %ymm0, %ymm0
+vpsrlq $26, %ymm1, %ymm9
+vpmuludq %ymm15, %ymm5, %ymm14
+vpmuludq %ymm2, %ymm5, %ymm5
+vpaddq %ymm5, %ymm7, %ymm5
+vpaddq %ymm14, %ymm0, %ymm0
+vpmuludq 352(%rsp), %ymm3, %ymm14
+vpmuludq %ymm6, %ymm3, %ymm3
+vpaddq %ymm3, %ymm5, %ymm5
+vpaddq %ymm14, %ymm0, %ymm0
+vpmuludq 320(%rsp), %ymm4, %ymm14
+vpmuludq %ymm15, %ymm4, %ymm4
+vpaddq %ymm4, %ymm5, %ymm5
+vpaddq %ymm14, %ymm0, %ymm0
+vpunpckldq %ymm10, %ymm11, %ymm4
+vpaddq 160(%rsp), %ymm0, %ymm14
+vpsrlq $26, %ymm13, %ymm0
+vpsllq $12, %ymm4, %ymm3
+vpaddq %ymm0, %ymm14, %ymm14
+vpaddq %ymm3, %ymm5, %ymm7
+vpsrlq $26, %ymm14, %ymm0
+vpaddq %ymm9, %ymm7, %ymm10
+vmovdqu 192(%rsp), %ymm9
+vpsrlq $26, %ymm10, %ymm11
+vpand (%rsp), %ymm9, %ymm6
+vpand %ymm9, %ymm13, %ymm13
+vpand %ymm9, %ymm1, %ymm1
+vpand %ymm9, %ymm14, %ymm14
+vpand %ymm9, %ymm10, %ymm5
+vpmuludq 128(%rsp), %ymm0, %ymm8
+vpaddq %ymm8, %ymm6, %ymm15
+vpaddq %ymm11, %ymm13, %ymm0
+vpsrlq $26, %ymm15, %ymm3
+vpand %ymm9, %ymm0, %ymm7
+vpsrlq $26, %ymm0, %ymm0
+vpand %ymm9, %ymm15, %ymm4
+vpaddq %ymm3, %ymm1, %ymm3
+vpaddq %ymm0, %ymm14, %ymm9
+cmpq %rdx, %r8
+jb poly1305_blocks_avx2_32
+poly1305_blocks_avx2_34:
+testq $64, %rax
+jne poly1305_blocks_avx2_36
+poly1305_blocks_avx2_35:
+vpshufd $8, %ymm4, %ymm0
+vpshufd $8, %ymm3, %ymm3
+vpshufd $8, %ymm5, %ymm5
+vpshufd $8, %ymm7, %ymm7
+vpshufd $8, %ymm9, %ymm9
+vpermq $8, %ymm0, %ymm1
+vpermq $8, %ymm3, %ymm2
+vpermq $8, %ymm5, %ymm4
+vpermq $8, %ymm7, %ymm6
+vpermq $8, %ymm9, %ymm11
+vperm2i128 $32, %ymm2, %ymm1, %ymm8
+vperm2i128 $32, %ymm6, %ymm4, %ymm10
+vmovdqu %ymm8, (%rdi)
+vmovdqu %ymm10, 32(%rdi)
+vmovdqu %xmm11, 64(%rdi)
+jmp poly1305_blocks_avx2_37
+poly1305_blocks_avx2_36:
+vpermq $245, %ymm4, %ymm0
+vpaddq %ymm0, %ymm4, %ymm4
+vpermq $245, %ymm3, %ymm1
+vpaddq %ymm1, %ymm3, %ymm10
+vpermq $245, %ymm5, %ymm3
+vpermq $170, %ymm4, %ymm6
+vpaddq %ymm3, %ymm5, %ymm13
+vpaddq %ymm6, %ymm4, %ymm8
+vpermq $170, %ymm10, %ymm11
+vpermq $245, %ymm7, %ymm5
+vpaddq %ymm11, %ymm10, %ymm12
+vpaddq %ymm5, %ymm7, %ymm7
+vpermq $170, %ymm13, %ymm14
+vpermq $245, %ymm9, %ymm2
+vpaddq %ymm14, %ymm13, %ymm15
+vpaddq %ymm2, %ymm9, %ymm9
+vpermq $170, %ymm7, %ymm0
+vpaddq %ymm0, %ymm7, %ymm1
+vpermq $170, %ymm9, %ymm2
+vpaddq %ymm2, %ymm9, %ymm3
+vmovd %xmm8, %r9d
+movl %r9d, %r8d
+shrl $26, %r8d
+andq $67108863, %r9
+vmovd %xmm12, %esi
+addl %r8d, %esi
+movl %esi, %r11d
+shrl $26, %esi
+andq $67108863, %r11
+vmovd %xmm15, %ecx
+addl %esi, %ecx
+movl %ecx, %eax
+shrl $26, %eax
+andq $67108863, %rcx
+shlq $8, %rcx
+vmovd %xmm1, %r8d
+addl %eax, %r8d
+movl %r8d, %r10d
+shrl $26, %r8d
+andq $67108863, %r10
+movq %r10, %rax
+shrq $10, %rax
+shlq $34, %r10
+vmovd %xmm3, %edx
+addl %r8d, %edx
+shlq $16, %rdx
+orq %rdx, %rax
+movq %rax, %r8
+shrq $42, %r8
+lea (%r8,%r8,4), %rdx
+movq %r11, %r8
+shlq $26, %r8
+orq %r8, %r9
+movq $0xfffffffffff, %r8
+shrq $18, %r11
+andq %r8, %r9
+addq %r9, %rdx
+orq %rcx, %r11
+movq %rdx, %rsi
+orq %r10, %r11
+shrq $44, %rsi
+andq %r8, %r11
+addq %r11, %rsi
+movq $0x3ffffffffff, %r9
+movq %rsi, %r10
+andq %r9, %rax
+shrq $44, %r10
+andq %r8, %rdx
+addq %r10, %rax
+movq %r8, %rcx
+andq %rax, %r9
+andq %r8, %rsi
+shrq $42, %rax
+movq $0xfffffc0000000000, %r10
+lea (%rax,%rax,4), %r11
+addq %r11, %rdx
+andq %rdx, %rcx
+shrq $44, %rdx
+addq %rdx, %rsi
+lea 5(%rcx), %rdx
+movq %rdx, %r11
+andq %r8, %rdx
+shrq $44, %r11
+addq %rsi, %r11
+movq %r11, %rax
+andq %r11, %r8
+shrq $44, %rax
+addq %r9, %rax
+addq %r10, %rax
+movq %rax, %r10
+shrq $63, %r10
+decq %r10
+andn %rcx, %r10, %rcx
+andq %r10, %rdx
+orq %rdx, %rcx
+andq %r10, %r8
+andn %rsi, %r10, %rdx
+andq %r10, %rax
+andn %r9, %r10, %rsi
+orq %r8, %rdx
+orq %rax, %rsi
+movq %rcx, (%rdi)
+movq %rdx, 8(%rdi)
+movq %rsi, 16(%rdi)
+poly1305_blocks_avx2_37:
+vzeroupper
+movq %rbp, %rsp
+popq %rbp
+ret
+FN_END poly1305_blocks_avx2
+
+GLOBAL_HIDDEN_FN poly1305_init_ext_avx2
+poly1305_init_ext_avx2_local:
+pushq %r12
+pushq %r13
+pushq %r14
+pushq %r15
+pushq %rbx
+movq %rdi, %r10
+vpxor %ymm0, %ymm0, %ymm0
+movq %rdx, %r12
+vpxor %xmm1, %xmm1, %xmm1
+vmovdqu %xmm1, 64(%r10)
+vmovdqu %ymm0, (%r10)
+vmovdqu %ymm0, 32(%r10)
+movq $-1, %r8
+testq %r12, %r12
+movq 8(%rsi), %rdi
+movq $0xffc0fffffff, %r9
+movq %rdi, %rcx
+cmove %r8, %r12
+movq (%rsi), %r8
+andq %r8, %r9
+shrq $44, %r8
+movq $0xfffffc0ffff, %r11
+shlq $20, %rcx
+shrq $24, %rdi
+orq %rcx, %r8
+movq $0xffffffc0f, %rcx
+andq %r11, %r8
+andq %rcx, %rdi
+movq 16(%rsi), %rcx
+movq %rcx, 160(%r10)
+movq %r9, %rcx
+movq 24(%rsi), %rdx
+movq %rdx, 168(%r10)
+movl %r9d, %edx
+andl $67108863, %edx
+movl %edx, 80(%r10)
+movq %r8, %rdx
+shrq $26, %rcx
+shlq $18, %rdx
+orq %rdx, %rcx
+movq %r8, %rdx
+shrq $8, %rdx
+andl $67108863, %ecx
+andl $67108863, %edx
+movl %ecx, 84(%r10)
+movq %r8, %rcx
+movl %edx, 88(%r10)
+movq %rdi, %rdx
+shrq $34, %rcx
+shlq $10, %rdx
+orq %rdx, %rcx
+movq %rdi, %rdx
+shrq $16, %rdx
+andl $67108863, %ecx
+movl %ecx, 92(%r10)
+movl %edx, 96(%r10)
+cmpq $16, %r12
+jbe poly1305_init_ext_avx2_7
+poly1305_init_ext_avx2_2:
+movq %r9, %rax
+lea (%rdi,%rdi,4), %r14
+mulq %r9
+shlq $2, %r14
+movq %rax, %r11
+movq %rdx, %r15
+lea (%r8,%r8), %rax
+mulq %r14
+addq %rax, %r11
+lea (%r9,%r9), %rax
+movq %r11, %rsi
+adcq %rdx, %r15
+mulq %r8
+movq %rax, %rbx
+movq %r14, %rax
+movq %rdx, %rcx
+lea (%rdi,%rdi), %r14
+mulq %rdi
+addq %rax, %rbx
+movq %r8, %rax
+adcq %rdx, %rcx
+mulq %r8
+shlq $20, %r15
+movq %rax, %r13
+shrq $44, %rsi
+movq %r9, %rax
+orq %rsi, %r15
+movq %rdx, %rsi
+mulq %r14
+addq %r15, %rbx
+movq %rbx, %r15
+adcq $0, %rcx
+addq %rax, %r13
+adcq %rdx, %rsi
+shlq $20, %rcx
+shrq $44, %r15
+orq %r15, %rcx
+addq %rcx, %r13
+movq $0xfffffffffff, %rcx
+movq %r13, %rdx
+adcq $0, %rsi
+andq %rcx, %r11
+shlq $22, %rsi
+andq %rcx, %rbx
+shrq $42, %rdx
+orq %rdx, %rsi
+lea (%rsi,%rsi,4), %rsi
+addq %rsi, %r11
+movq %rcx, %rsi
+andq %r11, %rsi
+shrq $44, %r11
+addq %r11, %rbx
+movq $0x3ffffffffff, %r11
+andq %rbx, %rcx
+andq %r11, %r13
+shrq $44, %rbx
+movq %rsi, %r11
+movq %rcx, %rdx
+addq %r13, %rbx
+shrq $26, %r11
+movq %rbx, %r15
+shlq $18, %rdx
+movq %rcx, %r14
+orq %rdx, %r11
+movq %rcx, %rdx
+shrq $34, %rdx
+movl %esi, %r13d
+shlq $10, %r15
+andl $67108863, %r13d
+orq %r15, %rdx
+andl $67108863, %r11d
+shrq $8, %r14
+andl $67108863, %edx
+movl %edx, 112(%r10)
+movq %rbx, %rdx
+shrq $16, %rdx
+andl $67108863, %r14d
+movl %r13d, 100(%r10)
+movl %r11d, 104(%r10)
+movl %r14d, 108(%r10)
+movl %edx, 116(%r10)
+cmpq $48, %r12
+jbe poly1305_init_ext_avx2_4
+poly1305_init_ext_avx2_3:
+movq %rsi, %rax
+lea (%rbx,%rbx,4), %r15
+mulq %rsi
+shlq $2, %r15
+movq %rax, %r13
+movq %rdx, %r12
+lea (%rcx,%rcx), %rax
+mulq %r15
+addq %rax, %r13
+lea (%rsi,%rsi), %rax
+movq %r15, -16(%rsp)
+adcq %rdx, %r12
+mulq %rcx
+movq %rax, %r14
+movq %rbx, %rax
+movq %rdx, %r11
+mulq %r15
+addq %rax, %r14
+movq %rcx, %rax
+movq %r13, %r15
+adcq %rdx, %r11
+mulq %rcx
+shlq $20, %r12
+shrq $44, %r15
+orq %r15, %r12
+movq %rax, %r15
+addq %r12, %r14
+movq %rdx, %r12
+movq %rsi, %rax
+lea (%rbx,%rbx), %rdx
+adcq $0, %r11
+mulq %rdx
+addq %rax, %r15
+adcq %rdx, %r12
+movq %r14, %rdx
+shlq $20, %r11
+shrq $44, %rdx
+orq %rdx, %r11
+addq %r11, %r15
+movq $0xfffffffffff, %r11
+movq %r15, %rdx
+adcq $0, %r12
+andq %r11, %r13
+shlq $22, %r12
+andq %r11, %r14
+shrq $42, %rdx
+orq %rdx, %r12
+lea (%r12,%r12,4), %r12
+addq %r12, %r13
+movq %r11, %r12
+andq %r13, %r12
+shrq $44, %r13
+addq %r13, %r14
+movq $0x3ffffffffff, %r13
+andq %r14, %r11
+andq %r13, %r15
+shrq $44, %r14
+movq %r11, %rdx
+shlq $18, %rdx
+addq %r14, %r15
+movl %r12d, %r14d
+movq %r11, %r13
+shrq $26, %r12
+andl $67108863, %r14d
+orq %rdx, %r12
+movq %r15, %rdx
+shrq $34, %r11
+shlq $10, %rdx
+andl $67108863, %r12d
+orq %rdx, %r11
+shrq $8, %r13
+andl $67108863, %r11d
+movl %r11d, 152(%r10)
+andl $67108863, %r13d
+shrq $16, %r15
+movl %r14d, 140(%r10)
+movl %r12d, 144(%r10)
+movl %r13d, 148(%r10)
+movl %r15d, 156(%r10)
+movq -16(%rsp), %r11
+jmp poly1305_init_ext_avx2_6
+poly1305_init_ext_avx2_4:
+cmpq $32, %r12
+jbe poly1305_init_ext_avx2_7
+poly1305_init_ext_avx2_5:
+lea (%rbx,%rbx,4), %r11
+shlq $2, %r11
+poly1305_init_ext_avx2_6:
+movq %r9, %rax
+lea (%rcx,%rcx,4), %r13
+mulq %rsi
+shlq $2, %r13
+movq %rax, %r14
+movq %rdi, %rax
+movq %rdx, %r12
+mulq %r13
+addq %rax, %r14
+movq %r8, %rax
+adcq %rdx, %r12
+mulq %r11
+addq %rax, %r14
+movq %r8, %rax
+adcq %rdx, %r12
+mulq %rsi
+movq %rax, %r15
+movq %r9, %rax
+movq %rdx, %r13
+mulq %rcx
+addq %rax, %r15
+movq %r11, %rax
+movq %r14, %r11
+adcq %rdx, %r13
+mulq %rdi
+addq %rax, %r15
+movq %rdi, %rax
+adcq %rdx, %r13
+mulq %rsi
+shlq $20, %r12
+movq %rax, %rsi
+shrq $44, %r11
+movq %r8, %rax
+orq %r11, %r12
+movq %rdx, %rdi
+mulq %rcx
+addq %r12, %r15
+movq %r15, %rcx
+adcq $0, %r13
+addq %rax, %rsi
+movq %r9, %rax
+movq $0xfffffffffff, %r9
+adcq %rdx, %rdi
+andq %r9, %r14
+mulq %rbx
+addq %rax, %rsi
+adcq %rdx, %rdi
+movq %r9, %rdx
+shlq $20, %r13
+andq %r9, %r15
+shrq $44, %rcx
+orq %rcx, %r13
+addq %r13, %rsi
+movq %rsi, %rbx
+adcq $0, %rdi
+shlq $22, %rdi
+shrq $42, %rbx
+orq %rbx, %rdi
+lea (%rdi,%rdi,4), %r8
+addq %r8, %r14
+andq %r14, %rdx
+shrq $44, %r14
+addq %r14, %r15
+movq $0x3ffffffffff, %r14
+andq %r15, %r9
+andq %r14, %rsi
+shrq $44, %r15
+movq %r9, %rax
+addq %r15, %rsi
+movl %edx, %r15d
+movq %rsi, %rbx
+movq %r9, %rcx
+shrq $26, %rdx
+andl $67108863, %r15d
+shlq $18, %rax
+shrq $34, %r9
+orq %rax, %rdx
+shlq $10, %rbx
+shrq $8, %rcx
+orq %rbx, %r9
+shrq $16, %rsi
+andl $67108863, %edx
+andl $67108863, %ecx
+andl $67108863, %r9d
+movl %r15d, 120(%r10)
+movl %edx, 124(%r10)
+movl %ecx, 128(%r10)
+movl %r9d, 132(%r10)
+movl %esi, 136(%r10)
+poly1305_init_ext_avx2_7:
+movq $0, 176(%r10)
+vzeroupper
+popq %rbx
+popq %r15
+popq %r14
+popq %r13
+popq %r12
+ret
+FN_END poly1305_init_ext_avx2
+
--- /dev/null
+SECTION_RODATA
+
+.p2align 4
+poly1305_constants_x86:
+/* 0 */ poly1305_x86_scale: .long 0x0,0x37f40000
+/* 8 */ poly1305_x86_two32: .long 0x0,0x41f00000
+/* 16 */ poly1305_x86_two64: .long 0x0,0x43f00000
+/* 24 */ poly1305_x86_two96: .long 0x0,0x45f00000
+/* 32 */ poly1305_x86_alpha32: .long 0x0,0x45e80000
+/* 40 */ poly1305_x86_alpha64: .long 0x0,0x47e80000
+/* 48 */ poly1305_x86_alpha96: .long 0x0,0x49e80000
+/* 56 */ poly1305_x86_alpha130: .long 0x0,0x4c080000
+/* 64 */ poly1305_x86_doffset0: .long 0x0,0x43300000
+/* 72 */ poly1305_x86_doffset1: .long 0x0,0x45300000
+/* 80 */ poly1305_x86_doffset2: .long 0x0,0x47300000
+/* 88 */ poly1305_x86_doffset3: .long 0x0,0x49300000
+/* 96 */ poly1305_x86_doffset3minustwo128: .long 0x0,0x492ffffe
+/* 104 */ poly1305_x86_hoffset0: .long 0xfffffffb,0x43300001
+/* 112 */ poly1305_x86_hoffset1: .long 0xfffffffe,0x45300001
+/* 120 */ poly1305_x86_hoffset2: .long 0xfffffffe,0x47300001
+/* 124 */ poly1305_x86_hoffset3: .long 0xfffffffe,0x49300003
+++ /dev/null
-/*
- poly1305 implementation using 16 bit * 16 bit = 32 bit multiplication and 32 bit addition
-*/
-
-#if defined(_MSC_VER)
- #define POLY1305_NOINLINE __declspec(noinline)
-#elif defined(__GNUC__)
- #define POLY1305_NOINLINE __attribute__((noinline))
-#else
- #define POLY1305_NOINLINE
-#endif
-
-#define poly1305_block_size 16
-
-/* 17 + sizeof(size_t) + 18*sizeof(unsigned short) */
-typedef struct poly1305_state_internal_t {
- unsigned char buffer[poly1305_block_size];
- size_t leftover;
- unsigned short r[10];
- unsigned short h[10];
- unsigned short pad[8];
- unsigned char final;
-} poly1305_state_internal_t;
-
-/* interpret two 8 bit unsigned integers as a 16 bit unsigned integer in little endian */
-static unsigned short
-U8TO16(const unsigned char *p) {
- return
- (((unsigned short)(p[0] & 0xff) ) |
- ((unsigned short)(p[1] & 0xff) << 8));
-}
-
-/* store a 16 bit unsigned integer as two 8 bit unsigned integers in little endian */
-static void
-U16TO8(unsigned char *p, unsigned short v) {
- p[0] = (v ) & 0xff;
- p[1] = (v >> 8) & 0xff;
-}
-
-void
-poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
- poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
- unsigned short t0,t1,t2,t3,t4,t5,t6,t7;
- size_t i;
-
- /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
- t0 = U8TO16(&key[ 0]); st->r[0] = ( t0 ) & 0x1fff;
- t1 = U8TO16(&key[ 2]); st->r[1] = ((t0 >> 13) | (t1 << 3)) & 0x1fff;
- t2 = U8TO16(&key[ 4]); st->r[2] = ((t1 >> 10) | (t2 << 6)) & 0x1f03;
- t3 = U8TO16(&key[ 6]); st->r[3] = ((t2 >> 7) | (t3 << 9)) & 0x1fff;
- t4 = U8TO16(&key[ 8]); st->r[4] = ((t3 >> 4) | (t4 << 12)) & 0x00ff;
- st->r[5] = ((t4 >> 1) ) & 0x1ffe;
- t5 = U8TO16(&key[10]); st->r[6] = ((t4 >> 14) | (t5 << 2)) & 0x1fff;
- t6 = U8TO16(&key[12]); st->r[7] = ((t5 >> 11) | (t6 << 5)) & 0x1f81;
- t7 = U8TO16(&key[14]); st->r[8] = ((t6 >> 8) | (t7 << 8)) & 0x1fff;
- st->r[9] = ((t7 >> 5) ) & 0x007f;
-
- /* h = 0 */
- for (i = 0; i < 10; i++)
- st->h[i] = 0;
-
- /* save pad for later */
- for (i = 0; i < 8; i++)
- st->pad[i] = U8TO16(&key[16 + (2 * i)]);
-
- st->leftover = 0;
- st->final = 0;
-}
-
-static void
-poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
- const unsigned short hibit = (st->final) ? 0 : (1 << 11); /* 1 << 128 */
- unsigned short t0,t1,t2,t3,t4,t5,t6,t7;
- unsigned long d[10];
- unsigned long c;
-
- while (bytes >= poly1305_block_size) {
- size_t i, j;
-
- /* h += m[i] */
- t0 = U8TO16(&m[ 0]); st->h[0] += ( t0 ) & 0x1fff;
- t1 = U8TO16(&m[ 2]); st->h[1] += ((t0 >> 13) | (t1 << 3)) & 0x1fff;
- t2 = U8TO16(&m[ 4]); st->h[2] += ((t1 >> 10) | (t2 << 6)) & 0x1fff;
- t3 = U8TO16(&m[ 6]); st->h[3] += ((t2 >> 7) | (t3 << 9)) & 0x1fff;
- t4 = U8TO16(&m[ 8]); st->h[4] += ((t3 >> 4) | (t4 << 12)) & 0x1fff;
- st->h[5] += ((t4 >> 1) ) & 0x1fff;
- t5 = U8TO16(&m[10]); st->h[6] += ((t4 >> 14) | (t5 << 2)) & 0x1fff;
- t6 = U8TO16(&m[12]); st->h[7] += ((t5 >> 11) | (t6 << 5)) & 0x1fff;
- t7 = U8TO16(&m[14]); st->h[8] += ((t6 >> 8) | (t7 << 8)) & 0x1fff;
- st->h[9] += ((t7 >> 5) ) | hibit;
-
- /* h *= r, (partial) h %= p */
- for (i = 0, c = 0; i < 10; i++) {
- d[i] = c;
- for (j = 0; j < 10; j++) {
- d[i] += (unsigned long)st->h[j] * ((j <= i) ? st->r[i - j] : (5 * st->r[i + 10 - j]));
- /* Sum(h[i] * r[i] * 5) will overflow slightly above 6 products with an unclamped r, so carry at 5 */
- if (j == 4) {
- c = (d[i] >> 13);
- d[i] &= 0x1fff;
- }
- }
- c += (d[i] >> 13);
- d[i] &= 0x1fff;
- }
- c = ((c << 2) + c); /* c *= 5 */
- c += d[0];
- d[0] = ((unsigned short)c & 0x1fff);
- c = (c >> 13);
- d[1] += c;
-
- for (i = 0; i < 10; i++)
- st->h[i] = (unsigned short)d[i];
-
- m += poly1305_block_size;
- bytes -= poly1305_block_size;
- }
-}
-
-POLY1305_NOINLINE void
-poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
- poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
- unsigned short c;
- unsigned short g[10];
- unsigned short mask;
- unsigned long f;
- size_t i;
-
- /* process the remaining block */
- if (st->leftover) {
- size_t i = st->leftover;
- st->buffer[i++] = 1;
- for (; i < poly1305_block_size; i++)
- st->buffer[i] = 0;
- st->final = 1;
- poly1305_blocks(st, st->buffer, poly1305_block_size);
- }
-
- /* fully carry h */
- c = st->h[1] >> 13;
- st->h[1] &= 0x1fff;
- for (i = 2; i < 10; i++) {
- st->h[i] += c;
- c = st->h[i] >> 13;
- st->h[i] &= 0x1fff;
- }
- st->h[0] += (c * 5);
- c = st->h[0] >> 13;
- st->h[0] &= 0x1fff;
- st->h[1] += c;
- c = st->h[1] >> 13;
- st->h[1] &= 0x1fff;
- st->h[2] += c;
-
- /* compute h + -p */
- g[0] = st->h[0] + 5;
- c = g[0] >> 13;
- g[0] &= 0x1fff;
- for (i = 1; i < 10; i++) {
- g[i] = st->h[i] + c;
- c = g[i] >> 13;
- g[i] &= 0x1fff;
- }
- g[9] -= (1 << 13);
-
- /* select h if h < p, or h + -p if h >= p */
- mask = (g[9] >> ((sizeof(unsigned short) * 8) - 1)) - 1;
- for (i = 0; i < 10; i++)
- g[i] &= mask;
- mask = ~mask;
- for (i = 0; i < 10; i++)
- st->h[i] = (st->h[i] & mask) | g[i];
-
- /* h = h % (2^128) */
- st->h[0] = ((st->h[0] ) | (st->h[1] << 13) ) & 0xffff;
- st->h[1] = ((st->h[1] >> 3) | (st->h[2] << 10) ) & 0xffff;
- st->h[2] = ((st->h[2] >> 6) | (st->h[3] << 7) ) & 0xffff;
- st->h[3] = ((st->h[3] >> 9) | (st->h[4] << 4) ) & 0xffff;
- st->h[4] = ((st->h[4] >> 12) | (st->h[5] << 1) | (st->h[6] << 14)) & 0xffff;
- st->h[5] = ((st->h[6] >> 2) | (st->h[7] << 11) ) & 0xffff;
- st->h[6] = ((st->h[7] >> 5) | (st->h[8] << 8) ) & 0xffff;
- st->h[7] = ((st->h[8] >> 8) | (st->h[9] << 5) ) & 0xffff;
-
- /* mac = (h + pad) % (2^128) */
- f = (unsigned long)st->h[0] + st->pad[0];
- st->h[0] = (unsigned short)f;
- for (i = 1; i < 8; i++) {
- f = (unsigned long)st->h[i] + st->pad[i] + (f >> 16);
- st->h[i] = (unsigned short)f;
- }
-
- for (i = 0; i < 8; i++)
- U16TO8(mac + (i * 2), st->h[i]);
-
- /* zero out the state */
- for (i = 0; i < 10; i++)
- st->h[i] = 0;
- for (i = 0; i < 10; i++)
- st->r[i] = 0;
- for (i = 0; i < 8; i++)
- st->pad[i] = 0;
-}
+++ /dev/null
-/*
- poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition
-*/
-
-#if defined(_MSC_VER)
- #define POLY1305_NOINLINE __declspec(noinline)
-#elif defined(__GNUC__)
- #define POLY1305_NOINLINE __attribute__((noinline))
-#else
- #define POLY1305_NOINLINE
-#endif
-
-#define poly1305_block_size 16
-
-/* 17 + sizeof(size_t) + 14*sizeof(unsigned long) */
-typedef struct poly1305_state_internal_t {
- unsigned long r[5];
- unsigned long h[5];
- unsigned long pad[4];
- size_t leftover;
- unsigned char buffer[poly1305_block_size];
- unsigned char final;
-} poly1305_state_internal_t;
-
-/* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */
-static unsigned long
-U8TO32(const unsigned char *p) {
- return
- (((unsigned long)(p[0] & 0xff) ) |
- ((unsigned long)(p[1] & 0xff) << 8) |
- ((unsigned long)(p[2] & 0xff) << 16) |
- ((unsigned long)(p[3] & 0xff) << 24));
-}
-
-/* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */
-static void
-U32TO8(unsigned char *p, unsigned long v) {
- p[0] = (v ) & 0xff;
- p[1] = (v >> 8) & 0xff;
- p[2] = (v >> 16) & 0xff;
- p[3] = (v >> 24) & 0xff;
-}
-
-void
-poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
- poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
-
- /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
- st->r[0] = (U8TO32(&key[ 0]) ) & 0x3ffffff;
- st->r[1] = (U8TO32(&key[ 3]) >> 2) & 0x3ffff03;
- st->r[2] = (U8TO32(&key[ 6]) >> 4) & 0x3ffc0ff;
- st->r[3] = (U8TO32(&key[ 9]) >> 6) & 0x3f03fff;
- st->r[4] = (U8TO32(&key[12]) >> 8) & 0x00fffff;
-
- /* h = 0 */
- st->h[0] = 0;
- st->h[1] = 0;
- st->h[2] = 0;
- st->h[3] = 0;
- st->h[4] = 0;
-
- /* save pad for later */
- st->pad[0] = U8TO32(&key[16]);
- st->pad[1] = U8TO32(&key[20]);
- st->pad[2] = U8TO32(&key[24]);
- st->pad[3] = U8TO32(&key[28]);
-
- st->leftover = 0;
- st->final = 0;
-}
-
-static void
-poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
- const unsigned long hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */
- unsigned long r0,r1,r2,r3,r4;
- unsigned long s1,s2,s3,s4;
- unsigned long h0,h1,h2,h3,h4;
- unsigned long long d0,d1,d2,d3,d4;
- unsigned long c;
-
- r0 = st->r[0];
- r1 = st->r[1];
- r2 = st->r[2];
- r3 = st->r[3];
- r4 = st->r[4];
-
- s1 = r1 * 5;
- s2 = r2 * 5;
- s3 = r3 * 5;
- s4 = r4 * 5;
-
- h0 = st->h[0];
- h1 = st->h[1];
- h2 = st->h[2];
- h3 = st->h[3];
- h4 = st->h[4];
-
- while (bytes >= poly1305_block_size) {
- /* h += m[i] */
- h0 += (U8TO32(m+ 0) ) & 0x3ffffff;
- h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
- h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
- h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
- h4 += (U8TO32(m+12) >> 8) | hibit;
-
- /* h *= r */
- d0 = ((unsigned long long)h0 * r0) + ((unsigned long long)h1 * s4) + ((unsigned long long)h2 * s3) + ((unsigned long long)h3 * s2) + ((unsigned long long)h4 * s1);
- d1 = ((unsigned long long)h0 * r1) + ((unsigned long long)h1 * r0) + ((unsigned long long)h2 * s4) + ((unsigned long long)h3 * s3) + ((unsigned long long)h4 * s2);
- d2 = ((unsigned long long)h0 * r2) + ((unsigned long long)h1 * r1) + ((unsigned long long)h2 * r0) + ((unsigned long long)h3 * s4) + ((unsigned long long)h4 * s3);
- d3 = ((unsigned long long)h0 * r3) + ((unsigned long long)h1 * r2) + ((unsigned long long)h2 * r1) + ((unsigned long long)h3 * r0) + ((unsigned long long)h4 * s4);
- d4 = ((unsigned long long)h0 * r4) + ((unsigned long long)h1 * r3) + ((unsigned long long)h2 * r2) + ((unsigned long long)h3 * r1) + ((unsigned long long)h4 * r0);
-
- /* (partial) h %= p */
- c = (unsigned long)(d0 >> 26); h0 = (unsigned long)d0 & 0x3ffffff;
- d1 += c; c = (unsigned long)(d1 >> 26); h1 = (unsigned long)d1 & 0x3ffffff;
- d2 += c; c = (unsigned long)(d2 >> 26); h2 = (unsigned long)d2 & 0x3ffffff;
- d3 += c; c = (unsigned long)(d3 >> 26); h3 = (unsigned long)d3 & 0x3ffffff;
- d4 += c; c = (unsigned long)(d4 >> 26); h4 = (unsigned long)d4 & 0x3ffffff;
- h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff;
- h1 += c;
-
- m += poly1305_block_size;
- bytes -= poly1305_block_size;
- }
-
- st->h[0] = h0;
- st->h[1] = h1;
- st->h[2] = h2;
- st->h[3] = h3;
- st->h[4] = h4;
-}
-
-POLY1305_NOINLINE void
-poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
- poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
- unsigned long h0,h1,h2,h3,h4,c;
- unsigned long g0,g1,g2,g3,g4;
- unsigned long long f;
- unsigned long mask;
-
- /* process the remaining block */
- if (st->leftover) {
- size_t i = st->leftover;
- st->buffer[i++] = 1;
- for (; i < poly1305_block_size; i++)
- st->buffer[i] = 0;
- st->final = 1;
- poly1305_blocks(st, st->buffer, poly1305_block_size);
- }
-
- /* fully carry h */
- h0 = st->h[0];
- h1 = st->h[1];
- h2 = st->h[2];
- h3 = st->h[3];
- h4 = st->h[4];
-
- c = h1 >> 26; h1 = h1 & 0x3ffffff;
- h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
- h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
- h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
- h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
- h1 += c;
-
- /* compute h + -p */
- g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
- g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
- g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
- g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
- g4 = h4 + c - (1 << 26);
-
- /* select h if h < p, or h + -p if h >= p */
- mask = (g4 >> ((sizeof(unsigned long) * 8) - 1)) - 1;
- g0 &= mask;
- g1 &= mask;
- g2 &= mask;
- g3 &= mask;
- g4 &= mask;
- mask = ~mask;
- h0 = (h0 & mask) | g0;
- h1 = (h1 & mask) | g1;
- h2 = (h2 & mask) | g2;
- h3 = (h3 & mask) | g3;
- h4 = (h4 & mask) | g4;
-
- /* h = h % (2^128) */
- h0 = ((h0 ) | (h1 << 26)) & 0xffffffff;
- h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
- h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
- h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
-
- /* mac = (h + pad) % (2^128) */
- f = (unsigned long long)h0 + st->pad[0] ; h0 = (unsigned long)f;
- f = (unsigned long long)h1 + st->pad[1] + (f >> 32); h1 = (unsigned long)f;
- f = (unsigned long long)h2 + st->pad[2] + (f >> 32); h2 = (unsigned long)f;
- f = (unsigned long long)h3 + st->pad[3] + (f >> 32); h3 = (unsigned long)f;
-
- U32TO8(mac + 0, h0);
- U32TO8(mac + 4, h1);
- U32TO8(mac + 8, h2);
- U32TO8(mac + 12, h3);
-
- /* zero out the state */
- st->h[0] = 0;
- st->h[1] = 0;
- st->h[2] = 0;
- st->h[3] = 0;
- st->h[4] = 0;
- st->r[0] = 0;
- st->r[1] = 0;
- st->r[2] = 0;
- st->r[3] = 0;
- st->r[4] = 0;
- st->pad[0] = 0;
- st->pad[1] = 0;
- st->pad[2] = 0;
- st->pad[3] = 0;
-}
-
+++ /dev/null
-/*
- poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition
-*/
-
-#if defined(_MSC_VER)
- #include <intrin.h>
-
- typedef struct uint128_t {
- unsigned long long lo;
- unsigned long long hi;
- } uint128_t;
-
- #define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi)
- #define ADD(out, in) { unsigned long long t = out.lo; out.lo += in.lo; out.hi += (out.lo < t) + in.hi; }
- #define ADDLO(out, in) { unsigned long long t = out.lo; out.lo += in; out.hi += (out.lo < t); }
- #define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift)))
- #define LO(in) (in.lo)
-
- #define POLY1305_NOINLINE __declspec(noinline)
-#elif defined(__GNUC__)
- #if defined(__SIZEOF_INT128__)
- typedef unsigned __int128 uint128_t;
- #else
- typedef unsigned uint128_t __attribute__((mode(TI)));
- #endif
-
- #define MUL(out, x, y) out = ((uint128_t)x * y)
- #define ADD(out, in) out += in
- #define ADDLO(out, in) out += in
- #define SHR(in, shift) (unsigned long long)(in >> (shift))
- #define LO(in) (unsigned long long)(in)
-
- #define POLY1305_NOINLINE __attribute__((noinline))
-#endif
-
-#define poly1305_block_size 16
-
-/* 17 + sizeof(size_t) + 8*sizeof(unsigned long long) */
-typedef struct poly1305_state_internal_t {
- unsigned long long r[3];
- unsigned long long h[3];
- unsigned long long pad[2];
- size_t leftover;
- unsigned char buffer[poly1305_block_size];
- unsigned char final;
-} poly1305_state_internal_t;
-
-/* interpret eight 8 bit unsigned integers as a 64 bit unsigned integer in little endian */
-static unsigned long long
-U8TO64(const unsigned char *p) {
- return
- (((unsigned long long)(p[0] & 0xff) ) |
- ((unsigned long long)(p[1] & 0xff) << 8) |
- ((unsigned long long)(p[2] & 0xff) << 16) |
- ((unsigned long long)(p[3] & 0xff) << 24) |
- ((unsigned long long)(p[4] & 0xff) << 32) |
- ((unsigned long long)(p[5] & 0xff) << 40) |
- ((unsigned long long)(p[6] & 0xff) << 48) |
- ((unsigned long long)(p[7] & 0xff) << 56));
-}
-
-/* store a 64 bit unsigned integer as eight 8 bit unsigned integers in little endian */
-static void
-U64TO8(unsigned char *p, unsigned long long v) {
- p[0] = (v ) & 0xff;
- p[1] = (v >> 8) & 0xff;
- p[2] = (v >> 16) & 0xff;
- p[3] = (v >> 24) & 0xff;
- p[4] = (v >> 32) & 0xff;
- p[5] = (v >> 40) & 0xff;
- p[6] = (v >> 48) & 0xff;
- p[7] = (v >> 56) & 0xff;
-}
-
-void
-poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
- poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
- unsigned long long t0,t1;
-
- /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
- t0 = U8TO64(&key[0]);
- t1 = U8TO64(&key[8]);
-
- st->r[0] = ( t0 ) & 0xffc0fffffff;
- st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
- st->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f;
-
- /* h = 0 */
- st->h[0] = 0;
- st->h[1] = 0;
- st->h[2] = 0;
-
- /* save pad for later */
- st->pad[0] = U8TO64(&key[16]);
- st->pad[1] = U8TO64(&key[24]);
-
- st->leftover = 0;
- st->final = 0;
-}
-
-static void
-poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
- const unsigned long long hibit = (st->final) ? 0 : ((unsigned long long)1 << 40); /* 1 << 128 */
- unsigned long long r0,r1,r2;
- unsigned long long s1,s2;
- unsigned long long h0,h1,h2;
- unsigned long long c;
- uint128_t d0,d1,d2,d;
-
- r0 = st->r[0];
- r1 = st->r[1];
- r2 = st->r[2];
-
- h0 = st->h[0];
- h1 = st->h[1];
- h2 = st->h[2];
-
- s1 = r1 * (5 << 2);
- s2 = r2 * (5 << 2);
-
- while (bytes >= poly1305_block_size) {
- unsigned long long t0,t1;
-
- /* h += m[i] */
- t0 = U8TO64(&m[0]);
- t1 = U8TO64(&m[8]);
-
- h0 += (( t0 ) & 0xfffffffffff);
- h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
- h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit;
-
- /* h *= r */
- MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
- MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
- MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);
-
- /* (partial) h %= p */
- c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
- ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
- ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
- h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff;
- h1 += c;
-
- m += poly1305_block_size;
- bytes -= poly1305_block_size;
- }
-
- st->h[0] = h0;
- st->h[1] = h1;
- st->h[2] = h2;
-}
-
-
-POLY1305_NOINLINE void
-poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
- poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
- unsigned long long h0,h1,h2,c;
- unsigned long long g0,g1,g2;
- unsigned long long t0,t1;
-
- /* process the remaining block */
- if (st->leftover) {
- size_t i = st->leftover;
- st->buffer[i] = 1;
- for (i = i + 1; i < poly1305_block_size; i++)
- st->buffer[i] = 0;
- st->final = 1;
- poly1305_blocks(st, st->buffer, poly1305_block_size);
- }
-
- /* fully carry h */
- h0 = st->h[0];
- h1 = st->h[1];
- h2 = st->h[2];
-
- c = (h1 >> 44); h1 &= 0xfffffffffff;
- h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
- h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
- h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff;
- h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
- h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
- h1 += c;
-
- /* compute h + -p */
- g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
- g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
- g2 = h2 + c - ((unsigned long long)1 << 42);
-
- /* select h if h < p, or h + -p if h >= p */
- c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1;
- g0 &= c;
- g1 &= c;
- g2 &= c;
- c = ~c;
- h0 = (h0 & c) | g0;
- h1 = (h1 & c) | g1;
- h2 = (h2 & c) | g2;
-
- /* h = (h + pad) */
- t0 = st->pad[0];
- t1 = st->pad[1];
-
- h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff;
- h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff;
- h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff;
-
- /* mac = h % (2^128) */
- h0 = ((h0 ) | (h1 << 44));
- h1 = ((h1 >> 20) | (h2 << 24));
-
- U64TO8(&mac[0], h0);
- U64TO8(&mac[8], h1);
-
- /* zero out the state */
- st->h[0] = 0;
- st->h[1] = 0;
- st->h[2] = 0;
- st->r[0] = 0;
- st->r[1] = 0;
- st->r[2] = 0;
- st->pad[0] = 0;
- st->pad[1] = 0;
-}
-
+++ /dev/null
-/*
- poly1305 implementation using 8 bit * 8 bit = 16 bit multiplication and 32 bit addition
-
- based on the public domain reference version in supercop by djb
-*/
-
-#if defined(_MSC_VER)
- #define POLY1305_NOINLINE __declspec(noinline)
-#elif defined(__GNUC__)
- #define POLY1305_NOINLINE __attribute__((noinline))
-#else
- #define POLY1305_NOINLINE
-#endif
-
-#define poly1305_block_size 16
-
-/* 17 + sizeof(size_t) + 51*sizeof(unsigned char) */
-typedef struct poly1305_state_internal_t {
- unsigned char buffer[poly1305_block_size];
- size_t leftover;
- unsigned char h[17];
- unsigned char r[17];
- unsigned char pad[17];
- unsigned char final;
-} poly1305_state_internal_t;
-
-void
-poly1305_init(poly1305_context *ctx, const unsigned char key[32]) {
- poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
- size_t i;
-
- st->leftover = 0;
-
- /* h = 0 */
- for (i = 0; i < 17; i++)
- st->h[i] = 0;
-
- /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
- st->r[ 0] = key[ 0] & 0xff;
- st->r[ 1] = key[ 1] & 0xff;
- st->r[ 2] = key[ 2] & 0xff;
- st->r[ 3] = key[ 3] & 0x0f;
- st->r[ 4] = key[ 4] & 0xfc;
- st->r[ 5] = key[ 5] & 0xff;
- st->r[ 6] = key[ 6] & 0xff;
- st->r[ 7] = key[ 7] & 0x0f;
- st->r[ 8] = key[ 8] & 0xfc;
- st->r[ 9] = key[ 9] & 0xff;
- st->r[10] = key[10] & 0xff;
- st->r[11] = key[11] & 0x0f;
- st->r[12] = key[12] & 0xfc;
- st->r[13] = key[13] & 0xff;
- st->r[14] = key[14] & 0xff;
- st->r[15] = key[15] & 0x0f;
- st->r[16] = 0;
-
- /* save pad for later */
- for (i = 0; i < 16; i++)
- st->pad[i] = key[i + 16];
- st->pad[16] = 0;
-
- st->final = 0;
-}
-
-static void
-poly1305_add(unsigned char h[17], const unsigned char c[17]) {
- unsigned short u;
- unsigned int i;
- for (u = 0, i = 0; i < 17; i++) {
- u += (unsigned short)h[i] + (unsigned short)c[i];
- h[i] = (unsigned char)u & 0xff;
- u >>= 8;
- }
-}
-
-static void
-poly1305_squeeze(unsigned char h[17], unsigned long hr[17]) {
- unsigned long u;
- unsigned int i;
- u = 0;
- for (i = 0; i < 16; i++) {
- u += hr[i];
- h[i] = (unsigned char)u & 0xff;
- u >>= 8;
- }
- u += hr[16];
- h[16] = (unsigned char)u & 0x03;
- u >>= 2;
- u += (u << 2); /* u *= 5; */
- for (i = 0; i < 16; i++) {
- u += h[i];
- h[i] = (unsigned char)u & 0xff;
- u >>= 8;
- }
- h[16] += (unsigned char)u;
-}
-
-static void
-poly1305_freeze(unsigned char h[17]) {
- static const unsigned char minusp[17] = {
- 0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0xfc
- };
- unsigned char horig[17], negative;
- unsigned int i;
-
- /* compute h + -p */
- for (i = 0; i < 17; i++)
- horig[i] = h[i];
- poly1305_add(h, minusp);
-
- /* select h if h < p, or h + -p if h >= p */
- negative = -(h[16] >> 7);
- for (i = 0; i < 17; i++)
- h[i] ^= negative & (horig[i] ^ h[i]);
-}
-
-static void
-poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, size_t bytes) {
- const unsigned char hibit = st->final ^ 1; /* 1 << 128 */
-
- while (bytes >= poly1305_block_size) {
- unsigned long hr[17], u;
- unsigned char c[17];
- unsigned int i, j;
-
- /* h += m */
- for (i = 0; i < 16; i++)
- c[i] = m[i];
- c[16] = hibit;
- poly1305_add(st->h, c);
-
- /* h *= r */
- for (i = 0; i < 17; i++) {
- u = 0;
- for (j = 0; j <= i ; j++) {
- u += (unsigned short)st->h[j] * st->r[i - j];
- }
- for (j = i + 1; j < 17; j++) {
- unsigned long v = (unsigned short)st->h[j] * st->r[i + 17 - j];
- v = ((v << 8) + (v << 6)); /* v *= (5 << 6); */
- u += v;
- }
- hr[i] = u;
- }
-
- /* (partial) h %= p */
- poly1305_squeeze(st->h, hr);
-
- m += poly1305_block_size;
- bytes -= poly1305_block_size;
- }
-}
-
-POLY1305_NOINLINE void
-poly1305_finish(poly1305_context *ctx, unsigned char mac[16]) {
- poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
- size_t i;
-
- /* process the remaining block */
- if (st->leftover) {
- size_t i = st->leftover;
- st->buffer[i++] = 1;
- for (; i < poly1305_block_size; i++)
- st->buffer[i] = 0;
- st->final = 1;
- poly1305_blocks(st, st->buffer, poly1305_block_size);
- }
-
- /* fully reduce h */
- poly1305_freeze(st->h);
-
- /* h = (h + pad) % (1 << 128) */
- poly1305_add(st->h, st->pad);
- for (i = 0; i < 16; i++)
- mac[i] = st->h[i];
-
- /* zero out the state */
- for (i = 0; i < 17; i++)
- st->h[i] = 0;
- for (i = 0; i < 17; i++)
- st->r[i] = 0;
- for (i = 0; i < 17; i++)
- st->pad[i] = 0;
-}
+++ /dev/null
-#include "poly1305-donna.h"
-
-#if defined(POLY1305_8BIT)
-#include "poly1305-donna-8.h"
-#elif defined(POLY1305_16BIT)
-#include "poly1305-donna-16.h"
-#elif defined(POLY1305_32BIT)
-#include "poly1305-donna-32.h"
-#elif defined(POLY1305_64BIT)
-#include "poly1305-donna-64.h"
-#else
-
-/* auto detect between 32bit / 64bit */
-#define HAS_SIZEOF_INT128_64BIT (defined(__SIZEOF_INT128__) && defined(__LP64__))
-#define HAS_MSVC_64BIT (defined(_MSC_VER) && defined(_M_X64))
-#define HAS_GCC_4_4_64BIT (defined(__GNUC__) && defined(__LP64__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4))))
-
-#if (HAS_SIZEOF_INT128_64BIT || HAS_MSVC_64BIT || HAS_GCC_4_4_64BIT)
-#include "poly1305-donna-64.h"
-#else
-#include "poly1305-donna-32.h"
-#endif
-
-#endif
-
-void
-poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes) {
- poly1305_state_internal_t *st = (poly1305_state_internal_t *)ctx;
- size_t i;
-
- /* handle leftover */
- if (st->leftover) {
- size_t want = (poly1305_block_size - st->leftover);
- if (want > bytes)
- want = bytes;
- for (i = 0; i < want; i++)
- st->buffer[st->leftover + i] = m[i];
- bytes -= want;
- m += want;
- st->leftover += want;
- if (st->leftover < poly1305_block_size)
- return;
- poly1305_blocks(st, st->buffer, poly1305_block_size);
- st->leftover = 0;
- }
-
- /* process full blocks */
- if (bytes >= poly1305_block_size) {
- size_t want = (bytes & ~(poly1305_block_size - 1));
- poly1305_blocks(st, m, want);
- m += want;
- bytes -= want;
- }
-
- /* store leftover */
- if (bytes) {
- for (i = 0; i < bytes; i++)
- st->buffer[st->leftover + i] = m[i];
- st->leftover += bytes;
- }
-}
-
-void
-poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]) {
- poly1305_context ctx;
- poly1305_init(&ctx, key);
- poly1305_update(&ctx, m, bytes);
- poly1305_finish(&ctx, mac);
-}
-
-int
-poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]) {
- size_t i;
- unsigned int dif = 0;
- for (i = 0; i < 16; i++)
- dif |= (mac1[i] ^ mac2[i]);
- dif = (dif - 1) >> ((sizeof(unsigned int) * 8) - 1);
- return (dif & 1);
-}
-
-
-/* test a few basic operations */
-int
-poly1305_power_on_self_test(void) {
- /* example from nacl */
- static const unsigned char nacl_key[32] = {
- 0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91,
- 0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25,
- 0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65,
- 0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80,
- };
-
- static const unsigned char nacl_msg[131] = {
- 0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73,
- 0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce,
- 0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4,
- 0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a,
- 0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b,
- 0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72,
- 0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2,
- 0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38,
- 0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a,
- 0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae,
- 0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea,
- 0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda,
- 0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde,
- 0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3,
- 0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6,
- 0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74,
- 0xe3,0x55,0xa5
- };
-
- static const unsigned char nacl_mac[16] = {
- 0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5,
- 0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9
- };
-
- /* generates a final value of (2^130 - 2) == 3 */
- static const unsigned char wrap_key[32] = {
- 0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- };
-
- static const unsigned char wrap_msg[16] = {
- 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
- 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
- };
-
- static const unsigned char wrap_mac[16] = {
- 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- };
-
- /*
- mac of the macs of messages of length 0 to 256, where the key and messages
- have all their values set to the length
- */
- static const unsigned char total_key[32] = {
- 0x01,0x02,0x03,0x04,0x05,0x06,0x07,
- 0xff,0xfe,0xfd,0xfc,0xfb,0xfa,0xf9,
- 0xff,0xff,0xff,0xff,0xff,0xff,0xff,
- 0xff,0xff,0xff,0xff,0xff,0xff,0xff
- };
-
- static const unsigned char total_mac[16] = {
- 0x64,0xaf,0xe2,0xe8,0xd6,0xad,0x7b,0xbd,
- 0xd2,0x87,0xf9,0x7c,0x44,0x62,0x3d,0x39
- };
-
- poly1305_context ctx;
- poly1305_context total_ctx;
- unsigned char all_key[32];
- unsigned char all_msg[256];
- unsigned char mac[16];
- size_t i, j;
- int result = 1;
-
- for (i = 0; i < sizeof(mac); i++)
- mac[i] = 0;
- poly1305_auth(mac, nacl_msg, sizeof(nacl_msg), nacl_key);
- result &= poly1305_verify(nacl_mac, mac);
-
- for (i = 0; i < sizeof(mac); i++)
- mac[i] = 0;
- poly1305_init(&ctx, nacl_key);
- poly1305_update(&ctx, nacl_msg + 0, 32);
- poly1305_update(&ctx, nacl_msg + 32, 64);
- poly1305_update(&ctx, nacl_msg + 96, 16);
- poly1305_update(&ctx, nacl_msg + 112, 8);
- poly1305_update(&ctx, nacl_msg + 120, 4);
- poly1305_update(&ctx, nacl_msg + 124, 2);
- poly1305_update(&ctx, nacl_msg + 126, 1);
- poly1305_update(&ctx, nacl_msg + 127, 1);
- poly1305_update(&ctx, nacl_msg + 128, 1);
- poly1305_update(&ctx, nacl_msg + 129, 1);
- poly1305_update(&ctx, nacl_msg + 130, 1);
- poly1305_finish(&ctx, mac);
- result &= poly1305_verify(nacl_mac, mac);
-
- for (i = 0; i < sizeof(mac); i++)
- mac[i] = 0;
- poly1305_auth(mac, wrap_msg, sizeof(wrap_msg), wrap_key);
- result &= poly1305_verify(wrap_mac, mac);
-
- poly1305_init(&total_ctx, total_key);
- for (i = 0; i < 256; i++) {
- /* set key and message to 'i,i,i..' */
- for (j = 0; j < sizeof(all_key); j++)
- all_key[j] = i;
- for (j = 0; j < i; j++)
- all_msg[j] = i;
- poly1305_auth(mac, all_msg, i, all_key);
- poly1305_update(&total_ctx, mac, 16);
- }
- poly1305_finish(&total_ctx, mac);
- result &= poly1305_verify(total_mac, mac);
-
- return result;
-}
+++ /dev/null
-#ifndef POLY1305_DONNA_H
-#define POLY1305_DONNA_H
-
-#include <stddef.h>
-
-typedef struct poly1305_context {
- size_t aligner;
- unsigned char opaque[136];
-} poly1305_context;
-
-void poly1305_init(poly1305_context *ctx, const unsigned char key[32]);
-void poly1305_update(poly1305_context *ctx, const unsigned char *m, size_t bytes);
-void poly1305_finish(poly1305_context *ctx, unsigned char mac[16]);
-void poly1305_auth(unsigned char mac[16], const unsigned char *m, size_t bytes, const unsigned char key[32]);
-
-int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);
-int poly1305_power_on_self_test(void);
-
-#endif /* POLY1305_DONNA_H */
-
--- /dev/null
+/*
+ * Copyright (c) 2015, Vsevolod Stakhov
+ * Copyright (c) 2015, Andrew Moon
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "cryptobox.h"
+#include "poly1305.h"
+#include "platform_config.h"
+
+extern unsigned long cpu_config;
+
+typedef struct poly1305_state_internal_t
+{
+ unsigned char opaque[192]; /* largest state required (AVX2) */
+ size_t leftover, block_size;
+ unsigned char buffer[64]; /* largest blocksize (AVX2) */
+} poly1305_state_internal;
+
+typedef struct poly1305_impl_t
+{
+ unsigned long cpu_flags;
+ const char *desc;
+
+ size_t (*block_size)(void);
+ void (*init_ext)(void *state, const poly1305_key *key, size_t bytes_hint);
+ void (*blocks)(void *state, const unsigned char *in, size_t inlen);
+ void (*finish_ext)(void *state, const unsigned char *in, size_t remaining,
+ unsigned char *mac);
+ void (*auth)(unsigned char *mac, const unsigned char *in, size_t inlen,
+ const poly1305_key *key);
+} poly1305_impl_t;
+
+#define POLY1305_DECLARE(ext) \
+ size_t poly1305_block_size_##ext(void); \
+ void poly1305_init_ext_##ext(void *state, const poly1305_key *key, size_t bytes_hint); \
+ void poly1305_blocks_##ext(void *state, const unsigned char *in, size_t inlen); \
+ void poly1305_finish_ext_##ext(void *state, const unsigned char *in, size_t remaining, unsigned char *mac); \
+ void poly1305_auth_##ext(unsigned char *mac, const unsigned char *m, size_t inlen, const poly1305_key *key);
+
+#define POLY1305_IMPL(cpuflags, desc, ext) \
+ {(cpuflags), desc, poly1305_block_size_##ext, poly1305_init_ext_##ext, poly1305_blocks_##ext, poly1305_finish_ext_##ext, poly1305_auth_##ext}
+
+#if defined(HAVE_AVX2)
+POLY1305_DECLARE(avx2)
+#define POLY1305_AVX2 POLY1305_IMPL(CPUID_AVX2, "avx2", avx2)
+#endif
+#if defined(HAVE_AVX)
+POLY1305_DECLARE(avx)
+#define POLY1305_AVX POLY1305_IMPL(CPUID_AVX, "avx", avx)
+#endif
+#if defined(HAVE_SSE2)
+POLY1305_DECLARE(sse2)
+#define POLY1305_SSE2 POLY1305_IMPL(CPUID_SSE2, "sse2", sse2)
+#endif
+
+POLY1305_DECLARE(ref)
+#define POLY1305_GENERIC POLY1305_IMPL(0, "generic", ref)
+
+/* implementation list: the generic fallback is first (its cpu_flags of 0 never match in poly1305_load), followed by cpu-specific versions from most to least optimized */
+static const poly1305_impl_t poly1305_list[] =
+{
+POLY1305_GENERIC,
+
+#if defined(POLY1305_AVX2)
+ POLY1305_AVX2,
+#endif
+#if defined(POLY1305_AVX)
+ POLY1305_AVX,
+#endif
+#if defined(POLY1305_SSE2)
+ POLY1305_SSE2,
+#endif
+ };
+
+static const poly1305_impl_t *poly1305_opt = &poly1305_list[0];
+;
+
+/* is the pointer aligned on a word boundary? */
+static int poly1305_is_aligned(const void *p)
+{
+ return ((size_t) p & (sizeof(size_t) - 1)) == 0;
+}
+
+void poly1305_load(void)
+{
+ guint i;
+
+ if (cpu_config != 0) {
+ for (i = 0; i < G_N_ELEMENTS(poly1305_list); i++) {
+ if (poly1305_list[i].cpu_flags & cpu_config) {
+ poly1305_opt = &poly1305_list[i];
+ break;
+ }
+ }
+ }
+}
+
+/* processes inlen bytes (full blocks only), handling input alignment */
+static void poly1305_consume(poly1305_state_internal *state,
+ const unsigned char *in, size_t inlen)
+{
+ int in_aligned;
+
+ /* it's ok to call with 0 bytes */
+ if (!inlen)
+ return;
+
+ /* if everything is aligned, handle directly */
+ in_aligned = poly1305_is_aligned (in);
+ if (in_aligned) {
+ poly1305_opt->blocks (state->opaque, in, inlen);
+ return;
+ }
+
+ /* copy the unaligned data to an aligned buffer and process in chunks */
+ while (inlen) {
+ unsigned char buffer[1024];
+ const size_t bytes = (inlen > sizeof(buffer)) ? sizeof(buffer) : inlen;
+ memcpy (buffer, in, bytes);
+ poly1305_opt->blocks (state->opaque, buffer, bytes);
+ in += bytes;
+ inlen -= bytes;
+ }
+}
+
+void poly1305_init(poly1305_state *S, const poly1305_key *key)
+{
+ poly1305_state_internal *state = (poly1305_state_internal *) S;
+ poly1305_opt->init_ext (state->opaque, key, 0);
+ state->leftover = 0;
+ state->block_size = poly1305_opt->block_size ();
+}
+
+void poly1305_init_ext(poly1305_state *S, const poly1305_key *key,
+ size_t bytes_hint)
+{
+ poly1305_state_internal *state = (poly1305_state_internal *) S;
+ poly1305_opt->init_ext (state->opaque, key, bytes_hint);
+ state->leftover = 0;
+ state->block_size = poly1305_opt->block_size ();
+}
+
+void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen)
+{
+ poly1305_state_internal *state = (poly1305_state_internal *) S;
+
+ /* handle leftover */
+ if (state->leftover) {
+ size_t want = (state->block_size - state->leftover);
+ if (want > inlen)
+ want = inlen;
+ memcpy (state->buffer + state->leftover, in, want);
+ inlen -= want;
+ in += want;
+ state->leftover += want;
+ if (state->leftover < state->block_size)
+ return;
+ poly1305_opt->blocks (state->opaque, state->buffer, state->block_size);
+ state->leftover = 0;
+ }
+
+ /* process full blocks */
+ if (inlen >= state->block_size) {
+ size_t want = (inlen & ~(state->block_size - 1));
+ poly1305_consume (state, in, want);
+ in += want;
+ inlen -= want;
+ }
+
+ /* store leftover */
+ if (inlen) {
+ memcpy (state->buffer + state->leftover, in, inlen);
+ state->leftover += inlen;
+ }
+}
+
+void poly1305_finish(poly1305_state *S, unsigned char *mac)
+{
+ poly1305_state_internal *state = (poly1305_state_internal *) S;
+ poly1305_opt->finish_ext (state->opaque, state->buffer, state->leftover,
+ mac);
+}
+
+void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen,
+ const poly1305_key *key)
+{
+ poly1305_opt->auth (mac, in, inlen, key);
+}
+
+int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16])
+{
+ size_t i;
+ unsigned int dif = 0;
+
+ for (i = 0; i < 16; i++) {
+ dif |= (mac1[i] ^ mac2[i]);
+ }
+
+ dif = (dif - 1) >> ((sizeof(unsigned int) * 8) - 1);
+ return (dif & 1);
+}
--- /dev/null
+#ifndef POLY1305_H
+#define POLY1305_H
+
+#include <stddef.h>
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+typedef struct poly1305_state
+{
+ unsigned char opaque[320];
+} poly1305_state;
+
+typedef struct poly1305_key
+{
+ unsigned char b[32];
+} poly1305_key;
+
+void poly1305_init(poly1305_state *S, const poly1305_key *key);
+void poly1305_init_ext(poly1305_state *S, const poly1305_key *key,
+ size_t bytes_hint);
+void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen);
+void poly1305_finish(poly1305_state *S, unsigned char *mac);
+
+void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen,
+ const poly1305_key *key);
+int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]);
+
+void poly1305_load(void);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* POLY1305_H */
+
--- /dev/null
+/*
+ poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition
+
+ assumes the existence of uint32_t and uint64_t
+*/
+
+#include "config.h"
+
+enum {
+ POLY1305_BLOCK_SIZE = 16
+};
+
+typedef struct poly1305_state_ref_t {
+ uint32_t r[5];
+ uint32_t h[5];
+ uint32_t pad[4];
+ unsigned char final;
+} poly1305_state_ref_t;
+
+/* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */
+static uint32_t
+U8TO32(const unsigned char *p) {
+ return
+ (((uint32_t)(p[0] & 0xff) ) |
+ ((uint32_t)(p[1] & 0xff) << 8) |
+ ((uint32_t)(p[2] & 0xff) << 16) |
+ ((uint32_t)(p[3] & 0xff) << 24));
+}
+
+/* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */
+static void
+U32TO8(unsigned char *p, uint32_t v) {
+ p[0] = (unsigned char)((v ) & 0xff);
+ p[1] = (unsigned char)((v >> 8) & 0xff);
+ p[2] = (unsigned char)((v >> 16) & 0xff);
+ p[3] = (unsigned char)((v >> 24) & 0xff);
+}
+
+static size_t
+poly1305_block_size_ref(void) {
+ return POLY1305_BLOCK_SIZE;
+}
+
+static void
+poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) {
+ poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
+
+ /* bytes_hint not used */
+ (void)bytes_hint;
+
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ st->r[0] = (U8TO32(&key->b[ 0]) ) & 0x3ffffff;
+ st->r[1] = (U8TO32(&key->b[ 3]) >> 2) & 0x3ffff03;
+ st->r[2] = (U8TO32(&key->b[ 6]) >> 4) & 0x3ffc0ff;
+ st->r[3] = (U8TO32(&key->b[ 9]) >> 6) & 0x3f03fff;
+ st->r[4] = (U8TO32(&key->b[12]) >> 8) & 0x00fffff;
+
+ /* h = 0 */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+ st->h[3] = 0;
+ st->h[4] = 0;
+
+ /* save pad for later */
+ st->pad[0] = U8TO32(&key->b[16]);
+ st->pad[1] = U8TO32(&key->b[20]);
+ st->pad[2] = U8TO32(&key->b[24]);
+ st->pad[3] = U8TO32(&key->b[28]);
+
+ st->final = 0;
+}
+
+static void
+poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) {
+ poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
+ const uint32_t hibit = (st->final) ? 0 : (1 << 24); /* 1 << 128 */
+ uint32_t r0,r1,r2,r3,r4;
+ uint32_t s1,s2,s3,s4;
+ uint32_t h0,h1,h2,h3,h4;
+ uint64_t d0,d1,d2,d3,d4;
+ uint32_t c;
+
+ r0 = st->r[0];
+ r1 = st->r[1];
+ r2 = st->r[2];
+ r3 = st->r[3];
+ r4 = st->r[4];
+
+ s1 = r1 * 5;
+ s2 = r2 * 5;
+ s3 = r3 * 5;
+ s4 = r4 * 5;
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ while (inlen >= POLY1305_BLOCK_SIZE) {
+ /* h += m[i] */
+ h0 += (U8TO32(in+ 0) ) & 0x3ffffff;
+ h1 += (U8TO32(in+ 3) >> 2) & 0x3ffffff;
+ h2 += (U8TO32(in+ 6) >> 4) & 0x3ffffff;
+ h3 += (U8TO32(in+ 9) >> 6) & 0x3ffffff;
+ h4 += (U8TO32(in+12) >> 8) | hibit;
+
+ /* h *= r */
+ d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + ((uint64_t)h4 * s1);
+ d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + ((uint64_t)h4 * s2);
+ d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + ((uint64_t)h4 * s3);
+ d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + ((uint64_t)h4 * s4);
+ d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + ((uint64_t)h4 * r0);
+
+ /* (partial) h %= p */
+ c = (uint32_t)(d0 >> 26); h0 = (uint32_t)d0 & 0x3ffffff;
+ d1 += c; c = (uint32_t)(d1 >> 26); h1 = (uint32_t)d1 & 0x3ffffff;
+ d2 += c; c = (uint32_t)(d2 >> 26); h2 = (uint32_t)d2 & 0x3ffffff;
+ d3 += c; c = (uint32_t)(d3 >> 26); h3 = (uint32_t)d3 & 0x3ffffff;
+ d4 += c; c = (uint32_t)(d4 >> 26); h4 = (uint32_t)d4 & 0x3ffffff;
+ h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff;
+ h1 += c;
+
+ in += POLY1305_BLOCK_SIZE;
+ inlen -= POLY1305_BLOCK_SIZE;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+ st->h[3] = h3;
+ st->h[4] = h4;
+}
+
+/*
+ * Finalize the 26-bit-limb Poly1305 state: absorb the trailing partial
+ * block (if any), fully reduce the accumulator h modulo 2^130-5, add the
+ * pad (the "s" half of the key), emit the 16-byte tag and wipe the state.
+ */
+static void
+poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) {
+	poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
+	uint32_t h0,h1,h2,h3,h4,c;
+	uint32_t g0,g1,g2,g3,g4;
+	uint64_t f;
+	uint32_t mask;
+
+	/* process the remaining block: copy it into a zeroed 16-byte buffer,
+	   append the 0x01 terminator; st->final tells poly1305_blocks_ref to
+	   omit the implicit 2^128 bit for this padded short block */
+	if (remaining) {
+		unsigned char final[POLY1305_BLOCK_SIZE] = {0};
+		size_t i;
+		for (i = 0; i < remaining; i++)
+			final[i] = in[i];
+		final[remaining] = 1;
+		st->final = 1;
+		poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE);
+	}
+
+	/* fully carry h (the block loop leaves the limbs only partially reduced) */
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+	h3 = st->h[3];
+	h4 = st->h[4];
+
+	c = h1 >> 26; h1 = h1 & 0x3ffffff;
+	h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
+	h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
+	h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
+	h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
+	h1 += c;
+
+	/* compute h + -p */
+	g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
+	g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
+	g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
+	g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
+	g4 = h4 + c - (1 << 26);
+
+	/* select h if h < p, or h + -p if h >= p: mask is 0 when the
+	   subtraction borrowed (g4's sign bit set, i.e. h < p) and all ones
+	   otherwise, so the selection is branch-free */
+	mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1;
+	g0 &= mask;
+	g1 &= mask;
+	g2 &= mask;
+	g3 &= mask;
+	g4 &= mask;
+	mask = ~mask;
+	h0 = (h0 & mask) | g0;
+	h1 = (h1 & mask) | g1;
+	h2 = (h2 & mask) | g2;
+	h3 = (h3 & mask) | g3;
+	h4 = (h4 & mask) | g4;
+
+	/* h = h % (2^128): repack the five 26-bit limbs into four 32-bit words */
+	h0 = ((h0 ) | (h1 << 26)) & 0xffffffff;
+	h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
+	h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
+	h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
+
+	/* mac = (h + pad) % (2^128), carrying in 64-bit arithmetic */
+	f = (uint64_t)h0 + st->pad[0] ; h0 = (uint32_t)f;
+	f = (uint64_t)h1 + st->pad[1] + (f >> 32); h1 = (uint32_t)f;
+	f = (uint64_t)h2 + st->pad[2] + (f >> 32); h2 = (uint32_t)f;
+	f = (uint64_t)h3 + st->pad[3] + (f >> 32); h3 = (uint32_t)f;
+
+	U32TO8(mac + 0, h0);
+	U32TO8(mac + 4, h1);
+	U32TO8(mac + 8, h2);
+	U32TO8(mac + 12, h3);
+
+	/* zero out the state so key material does not linger */
+	st->h[0] = 0;
+	st->h[1] = 0;
+	st->h[2] = 0;
+	st->h[3] = 0;
+	st->h[4] = 0;
+	st->r[0] = 0;
+	st->r[1] = 0;
+	st->r[2] = 0;
+	st->r[3] = 0;
+	st->r[4] = 0;
+	st->pad[0] = 0;
+	st->pad[1] = 0;
+	st->pad[2] = 0;
+	st->pad[3] = 0;
+}
+
+/*
+ * One-shot Poly1305 MAC: mac = Poly1305(key, in[0..inlen)).
+ * Feeds all whole 16-byte blocks to poly1305_blocks_ref, then hands the
+ * 0..15 byte tail to poly1305_finish_ext_ref.
+ */
+static void
+poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) {
+	poly1305_state_ref_t st;
+	size_t blocks;
+	poly1305_init_ext_ref(&st, key, inlen);
+	/* round inlen down to a multiple of POLY1305_BLOCK_SIZE (16, a power of two) */
+	blocks = (inlen & ~(POLY1305_BLOCK_SIZE - 1));
+	if (blocks) {
+		poly1305_blocks_ref(&st, in, blocks);
+		in += blocks;
+		inlen -= blocks;
+	}
+	poly1305_finish_ext_ref(&st, in, inlen, mac);
+}
+
--- /dev/null
+/*
+ poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition
+
+ assumes the existence of uint64_t and uint128_t
+*/
+
+#include "config.h"
+/* Poly1305 consumes the message in 16-byte blocks. */
+enum {
+	POLY1305_BLOCK_SIZE = 16
+};
+
+/* Platform glue: a 128-bit unsigned integer type and a noinline marker.
+   NOTE(review): the MSVC branch defines uint128_t as a two-member struct,
+   but the block code below uses native 128-bit operators (e.g.
+   (uint128_t)h0 * r0), which a struct cannot support — confirm this file
+   is never built with MSVC, or add intrinsic-based mul/add helpers. */
+#if defined(_MSC_VER)
+	#include <intrin.h>
+
+	typedef struct uint128_t {
+		unsigned long long lo;
+		unsigned long long hi;
+	} uint128_t;
+
+	#define POLY1305_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+	#if defined(__SIZEOF_INT128__)
+		typedef unsigned __int128 uint128_t;
+	#else
+		typedef unsigned uint128_t __attribute__((mode(TI)));
+	#endif
+
+	#define POLY1305_NOINLINE __attribute__((noinline))
+#endif
+
+/* Poly1305 running state, radix-2^44 (limbs of 44/44/42 bits). */
+typedef struct poly1305_state_ref_t {
+	uint64_t r[3];        /* clamped key half "r" in 44/44/42-bit limbs */
+	uint64_t h[3];        /* accumulator, same limb layout, partially reduced */
+	uint64_t pad[2];      /* key half "s", added to h at finalization */
+	unsigned char final;  /* set once the padded last block is being absorbed */
+} poly1305_state_ref_t;
+
+/* interpret eight 8 bit unsigned integers as a 64 bit unsigned integer in little endian */
+static uint64_t
+U8TO64(const unsigned char *p) {
+	/* byte-wise assembly keeps the load endian- and alignment-safe */
+	return
+	((uint64_t)p[0] ) |
+	((uint64_t)p[1] << 8) |
+	((uint64_t)p[2] << 16) |
+	((uint64_t)p[3] << 24) |
+	((uint64_t)p[4] << 32) |
+	((uint64_t)p[5] << 40) |
+	((uint64_t)p[6] << 48) |
+	((uint64_t)p[7] << 56);
+}
+
+/* store a 64 bit unsigned integer as eight 8 bit unsigned integers in little endian */
+static void
+U64TO8(unsigned char *p, uint64_t v) {
+	/* the (unsigned char) cast binds before "& 0xff", so the mask is
+	   redundant but harmless; byte-wise stores are endian-safe */
+	p[0] = (unsigned char)(v ) & 0xff;
+	p[1] = (unsigned char)(v >> 8) & 0xff;
+	p[2] = (unsigned char)(v >> 16) & 0xff;
+	p[3] = (unsigned char)(v >> 24) & 0xff;
+	p[4] = (unsigned char)(v >> 32) & 0xff;
+	p[5] = (unsigned char)(v >> 40) & 0xff;
+	p[6] = (unsigned char)(v >> 48) & 0xff;
+	p[7] = (unsigned char)(v >> 56) & 0xff;
+}
+
+/* Report the block granularity of this implementation (16 bytes). */
+static size_t
+poly1305_block_size_ref(void) {
+	return POLY1305_BLOCK_SIZE;
+}
+
+/*
+ * Initialize the 44-bit-limb Poly1305 state from a 32-byte key: clamp
+ * key bytes 0..15 into r (limbs of 44/44/42 bits), clear the accumulator,
+ * and stash key bytes 16..31 as the pad added during finalization.
+ */
+static void
+poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) {
+	poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
+	uint64_t t0, t1;
+
+	/* bytes_hint not used */
+	(void)bytes_hint;
+
+	/* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff (the poly1305 key clamp) */
+	t0 = U8TO64(&key->b[0]);
+	t1 = U8TO64(&key->b[8]);
+	st->r[0] = ( t0 ) & 0xffc0fffffff;
+	st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
+	st->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f;
+
+	/* h = 0 */
+	st->h[0] = 0;
+	st->h[1] = 0;
+	st->h[2] = 0;
+
+	/* save pad for later */
+	st->pad[0] = U8TO64(&key->b[16]);
+	st->pad[1] = U8TO64(&key->b[24]);
+
+	st->final = 0;
+}
+
+/*
+ * Absorb full 16-byte blocks: h = (h + block + 2^128) * r mod 2^130-5,
+ * computed in 44/44/42-bit limbs via 64x64->128-bit multiplies. Only a
+ * partial carry is done per block; the full reduction happens in finish.
+ * When st->final is set the implicit 2^128 bit is omitted (padded block).
+ */
+static void
+poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) {
+	poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
+	const uint64_t hibit = (st->final) ? 0 : ((uint64_t)1 << 40); /* 1 << 128 */
+	uint64_t r0,r1,r2;
+	uint64_t s1,s2;
+	uint64_t h0,h1,h2;
+	uint64_t c;
+	uint128_t d0,d1,d2;
+
+	r0 = st->r[0];
+	r1 = st->r[1];
+	r2 = st->r[2];
+
+	/* cross terms wrap past 2^130; since 2^130 = 5 (mod p) and the limbs
+	   sit at 2^44 / 2^88, the wrapped factor is 5 * 4 = 20 */
+	s1 = r1 * (5 << 2);
+	s2 = r2 * (5 << 2);
+
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+
+	while (inlen >= POLY1305_BLOCK_SIZE) {
+		uint64_t t0, t1;
+
+		/* h += in[i] */
+		t0 = U8TO64(in + 0);
+		t1 = U8TO64(in + 8);
+		h0 += (( t0 ) & 0xfffffffffff);
+		h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
+		h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit;
+
+		/* h *= r */
+		d0 = ((uint128_t)h0 * r0) + ((uint128_t)h1 * s2) + ((uint128_t)h2 * s1);
+		d1 = ((uint128_t)h0 * r1) + ((uint128_t)h1 * r0) + ((uint128_t)h2 * s2);
+		d2 = ((uint128_t)h0 * r2) + ((uint128_t)h1 * r1) + ((uint128_t)h2 * r0);
+
+		/* (partial) h %= p */
+		c = (uint64_t)(d0 >> 44); h0 = (uint64_t)d0 & 0xfffffffffff;
+		d1 += c; c = (uint64_t)(d1 >> 44); h1 = (uint64_t)d1 & 0xfffffffffff;
+		d2 += c; c = (uint64_t)(d2 >> 42); h2 = (uint64_t)d2 & 0x3ffffffffff;
+		h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff;
+		h1 += c;
+
+		in += POLY1305_BLOCK_SIZE;
+		inlen -= POLY1305_BLOCK_SIZE;
+	}
+
+	st->h[0] = h0;
+	st->h[1] = h1;
+	st->h[2] = h2;
+}
+
+/*
+ * Finalize the 44-bit-limb Poly1305 state: absorb the trailing partial
+ * block (if any), fully reduce h modulo 2^130-5, add the pad, emit the
+ * 16-byte tag and wipe all key-dependent state.
+ */
+static void
+poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) {
+	poly1305_state_ref_t *st = (poly1305_state_ref_t *)state;
+	uint64_t h0, h1, h2, c;
+	uint64_t g0, g1, g2;
+	uint64_t t0, t1;
+
+	/* process the remaining block: zero-pad to 16 bytes, append the 0x01
+	   terminator; st->final makes poly1305_blocks_ref omit the 2^128 bit */
+	if (remaining) {
+		unsigned char final[POLY1305_BLOCK_SIZE] = {0};
+		size_t i;
+		for (i = 0; i < remaining; i++)
+			final[i] = in[i];
+		final[remaining] = 1;
+		st->final = 1;
+		poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE);
+	}
+
+	/* fully carry h (two passes finish the lazy per-block reduction) */
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+
+	c = (h1 >> 44); h1 &= 0xfffffffffff;
+	h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
+	h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
+	h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff;
+	h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
+	h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
+	h1 += c;
+
+	/* compute h + -p */
+	g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
+	g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
+	g2 = h2 + c - ((uint64_t)1 << 42);
+
+	/* select h if h < p, or h + -p if h >= p: c is 0 when the subtraction
+	   borrowed (g2's sign bit set, i.e. h < p) and all ones otherwise,
+	   so the selection is branch-free */
+	c = (g2 >> 63) - 1;
+	h0 = (h0 & ~c) | (g0 & c);
+	h1 = (h1 & ~c) | (g1 & c);
+	h2 = (h2 & ~c) | (g2 & c);
+
+	/* h = (h + pad), splitting the pad into matching 44/44/42-bit limbs */
+	t0 = st->pad[0];
+	t1 = st->pad[1];
+
+	h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff;
+	h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff;
+	h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff;
+
+	/* mac = h % (2^128): repack the limbs into two 64-bit words */
+	h0 = ((h0 ) | (h1 << 44));
+	h1 = ((h1 >> 20) | (h2 << 24));
+
+	U64TO8(&mac[0], h0);
+	U64TO8(&mac[8], h1);
+
+	/* zero out the state so key material does not linger */
+	st->h[0] = 0;
+	st->h[1] = 0;
+	st->h[2] = 0;
+	st->r[0] = 0;
+	st->r[1] = 0;
+	st->r[2] = 0;
+	st->pad[0] = 0;
+	st->pad[1] = 0;
+}
+
+
+/*
+ * One-shot Poly1305 MAC: mac = Poly1305(key, in[0..inlen)).
+ * Feeds all whole 16-byte blocks to poly1305_blocks_ref, then hands the
+ * 0..15 byte tail to poly1305_finish_ext_ref.
+ */
+static void
+poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) {
+	poly1305_state_ref_t st;
+	size_t blocks;
+	poly1305_init_ext_ref(&st, key, inlen);
+	/* round inlen down to a multiple of POLY1305_BLOCK_SIZE (16, a power of two) */
+	blocks = (inlen & ~(POLY1305_BLOCK_SIZE - 1));
+	if (blocks) {
+		poly1305_blocks_ref(&st, in, blocks);
+		in += blocks;
+		inlen -= blocks;
+	}
+	poly1305_finish_ext_ref(&st, in, inlen, mac);
+}
+
--- /dev/null
+#include "../chacha20/macro.S"
+#include "constants.S"
+SECTION_TEXT
+
+/* size_t poly1305_block_size_sse2(void) — returns 32, the granularity
+   this SIMD implementation prefers (two 16-byte poly1305 blocks). */
+GLOBAL_HIDDEN_FN poly1305_block_size_sse2
+movl $32, %eax
+ret
+FN_END poly1305_block_size_sse2
+
+/*
+ * void poly1305_init_ext_sse2(void *state %rdi, const poly1305_key *key %rsi,
+ *                             size_t bytes_hint %rdx)
+ * Clamps r into 44/44/42-bit limbs, zeroes h, saves the 16-byte pad at
+ * state+104/112 and clears the flag word at state+120. The two-pass loop
+ * below squares the running power of r and appears to store r^2 and r^4
+ * in radix-2^26 limb form at state+60 and state+80 for the SIMD block
+ * code (r itself is stored in 2^26 limbs at state+40); the stores are
+ * skipped when bytes_hint says they will not be needed. A hint of 0 is
+ * treated as "length unknown" (all ones).
+ */
+GLOBAL_HIDDEN_FN poly1305_init_ext_sse2
+poly1305_init_ext_sse2_local:
+pushq %r15
+xorps %xmm0, %xmm0
+testq %rdx, %rdx
+pushq %r14
+movq %rdx, %r11
+movq $-1, %rax
+/* bytes_hint == 0 -> SIZE_MAX ("unknown") */
+cmove %rax, %r11
+pushq %r13
+/* 0xffc0fffffff: clamp mask for r limb 0 */
+movabsq $17575274610687, %r9
+pushq %r12
+pushq %rbp
+movq %r11, %r13
+/* 0xfffffffffff: 44-bit limb mask */
+movabsq $17592186044415, %rbp
+pushq %rbx
+xorl %ebx, %ebx
+/* h = 0 (48 bytes at state+0 cleared) */
+movdqu %xmm0, 32(%rdi)
+movdqu %xmm0, (%rdi)
+movdqu %xmm0, 16(%rdi)
+movq 8(%rsi), %rcx
+movq (%rsi), %rax
+movq %rcx, %rdx
+shrq $24, %rcx
+andq %rax, %r9
+salq $20, %rdx
+shrq $44, %rax
+movq %r9, %r8
+orq %rax, %rdx
+shrq $26, %r8
+/* 0xfffffc0ffff: clamp mask for r limb 1 */
+movabsq $17592181915647, %rax
+andq %rax, %rdx
+/* 0xffffffc0f: clamp mask for r limb 2 */
+movabsq $68719475727, %rax
+andq %rax, %rcx
+/* store r as five 26-bit limbs at state+40..56 */
+movl %r9d, %eax
+andl $67108863, %eax
+movl %eax, 40(%rdi)
+movl %edx, %eax
+sall $18, %eax
+orl %r8d, %eax
+movq %rdx, %r8
+andl $67108863, %eax
+shrq $34, %r8
+movl %eax, 44(%rdi)
+movq %rdx, %rax
+shrq $8, %rax
+andl $67108863, %eax
+movl %eax, 48(%rdi)
+movl %ecx, %eax
+sall $10, %eax
+orl %r8d, %eax
+movq %rdi, %r8
+andl $67108863, %eax
+movl %eax, 52(%rdi)
+movq %rcx, %rax
+shrq $16, %rax
+movl %eax, 56(%rdi)
+/* pad = key bytes 16..31 */
+movq 16(%rsi), %rax
+movq %rax, 104(%rdi)
+movq 24(%rsi), %rax
+movq %rdx, %rsi
+movq %rax, 112(%rdi)
+/* power loop: %rbx counts the pass (0 then 1) */
+poly1305_init_ext_sse2_7:
+testq %rbx, %rbx
+jne poly1305_init_ext_sse2_4
+cmpq $16, %r13
+jbe poly1305_init_ext_sse2_5
+leaq 60(%r8), %rdi
+jmp poly1305_init_ext_sse2_6
+poly1305_init_ext_sse2_4:
+cmpq $96, %r13
+jb poly1305_init_ext_sse2_5
+leaq 80(%r8), %rdi
+/* square the current (r9,rsi,rcx) power of r mod 2^130-5 */
+poly1305_init_ext_sse2_6:
+imulq $20, %rcx, %r10
+movq $0, -48(%rsp)
+movq $0, -32(%rsp)
+leaq (%rsi,%rsi), %r14
+leaq (%r9,%r9), %r11
+movq %r10, %rax
+mulq %r14
+movq %rax, %r14
+movq %r9, %rax
+movq %rdx, %r15
+mulq %r9
+addq %rax, %r14
+movq %r14, %rax
+adcq %rdx, %r15
+leaq (%rcx,%rcx), %rdx
+andq %rbp, %rax
+movq %rax, -16(%rsp)
+movq %r11, %rax
+movq %rdx, -24(%rsp)
+mulq %rsi
+movq %rax, %r11
+movq %r10, %rax
+movq %rdx, %r12
+mulq %rcx
+movq -16(%rsp), %rcx
+addq %rax, %r11
+movq %r14, %rax
+adcq %rdx, %r12
+shrdq $44, %r15, %rax
+movq %rax, -56(%rsp)
+movq -24(%rsp), %rax
+addq -56(%rsp), %r11
+adcq -48(%rsp), %r12
+mulq %r9
+movq %r11, %r14
+andq %rbp, %r14
+movq %rax, %r9
+movq %rsi, %rax
+movq %rdx, %r10
+mulq %rsi
+addq %rax, %r9
+movq %r11, %rax
+adcq %rdx, %r10
+shrdq $44, %r12, %rax
+movq %rax, -40(%rsp)
+/* 0x3ffffffffff: 42-bit limb mask */
+movabsq $4398046511103, %rax
+addq -40(%rsp), %r9
+adcq -32(%rsp), %r10
+andq %r9, %rax
+incq %rbx
+shrdq $42, %r10, %r9
+leaq (%r9,%r9,4), %r9
+addq %r9, %rcx
+movq %rcx, %r9
+shrq $44, %rcx
+addq %r14, %rcx
+andq %rbp, %r9
+movq %rcx, %rsi
+shrq $44, %rcx
+movq %r9, %rdx
+addq %rax, %rcx
+/* store this power as five 26-bit limbs at (%rdi) */
+movl %r9d, %eax
+andq %rbp, %rsi
+andl $67108863, %eax
+shrq $26, %rdx
+movl %eax, (%rdi)
+movl %esi, %eax
+sall $18, %eax
+orl %edx, %eax
+movq %rsi, %rdx
+andl $67108863, %eax
+shrq $34, %rdx
+movl %eax, 4(%rdi)
+movq %rsi, %rax
+shrq $8, %rax
+andl $67108863, %eax
+movl %eax, 8(%rdi)
+movl %ecx, %eax
+sall $10, %eax
+orl %edx, %eax
+andl $67108863, %eax
+movl %eax, 12(%rdi)
+movq %rcx, %rax
+shrq $16, %rax
+cmpq $2, %rbx
+movl %eax, 16(%rdi)
+jne poly1305_init_ext_sse2_7
+/* flags (state+120) = 0, restore callee-saved registers */
+poly1305_init_ext_sse2_5:
+movq $0, 120(%r8)
+popq %rbx
+popq %rbp
+popq %r12
+popq %r13
+popq %r14
+popq %r15
+ret
+FN_END poly1305_init_ext_sse2
+
+
+/*
+ * void poly1305_blocks_sse2(void *state %rdi, const uint8_t *in %rsi,
+ *                           size_t inlen %rdx)
+ * SIMD block loop over radix-2^26 limbs, two lanes per xmm register; the
+ * hot loop consumes 64 bytes of input per iteration. The flag word at
+ * state+120 drives behaviour: bit 0 = started (h already in state),
+ * bits 2/3 (values 4/8) narrow or clear the implicit 2^128 block marker,
+ * bits 4/5 (values 16/32) select which precomputed powers of r (at
+ * state+40/+60/+80) to multiply by. Calling with in == NULL performs the
+ * final lane fold and full reduction mod 2^130-5, storing h back as
+ * three 44/44/42-bit limbs at state+0/8/16.
+ */
+GLOBAL_HIDDEN_FN poly1305_blocks_sse2
+poly1305_blocks_sse2_local:
+pushq %rbp
+movq %rsp, %rbp
+pushq %rbx
+andq $-64, %rsp
+subq $328, %rsp
+/* 312(%rsp) = 2^128 marker in radix-2^26: bit 24 of limb 4, both lanes */
+movq $(1 << 24), %rax
+movd %rax, %xmm1
+/* xmm0 = 26-bit limb mask in both lanes */
+movq $((1 << 26) - 1), %rax
+movd %rax, %xmm0
+pshufd $68, %xmm1, %xmm1
+pshufd $68, %xmm0, %xmm0
+movq 120(%rdi), %rax
+movaps %xmm1, 312(%rsp)
+/* flag bit 2: keep the marker in one lane only */
+testb $4, %al
+je poly1305_blocks_sse2_11
+movaps 312(%rsp), %xmm1
+psrldq $8, %xmm1
+movaps %xmm1, 312(%rsp)
+poly1305_blocks_sse2_11:
+/* flag bit 3: no marker at all */
+testb $8, %al
+je poly1305_blocks_sse2_12
+xorps %xmm1, %xmm1
+movaps %xmm1, 312(%rsp)
+poly1305_blocks_sse2_12:
+/* flag bit 0 clear: first call — seed h from the first 32 input bytes */
+testb $1, %al
+jne poly1305_blocks_sse2_13
+movq 16(%rsi), %xmm1
+movaps %xmm0, %xmm3
+movaps %xmm0, %xmm9
+movq (%rsi), %xmm15
+orq $1, %rax
+subq $32, %rdx
+movq 8(%rsi), %xmm12
+punpcklqdq %xmm1, %xmm15
+movq 24(%rsi), %xmm1
+movaps %xmm15, %xmm8
+pand %xmm15, %xmm3
+psrlq $52, %xmm15
+addq $32, %rsi
+punpcklqdq %xmm1, %xmm12
+movaps %xmm12, %xmm1
+psrlq $26, %xmm8
+psllq $12, %xmm1
+pand %xmm0, %xmm8
+movq %rax, 120(%rdi)
+por %xmm1, %xmm15
+psrlq $40, %xmm12
+pand %xmm15, %xmm9
+por 312(%rsp), %xmm12
+psrlq $26, %xmm15
+pand %xmm0, %xmm15
+jmp poly1305_blocks_sse2_14
+/* resuming: reload the five h limb pairs from state+0/16/32 */
+poly1305_blocks_sse2_13:
+movdqu (%rdi), %xmm8
+movdqu 16(%rdi), %xmm15
+movdqu 32(%rdi), %xmm12
+pshufd $80, %xmm8, %xmm3
+pshufd $250, %xmm8, %xmm8
+pshufd $80, %xmm15, %xmm9
+pshufd $250, %xmm15, %xmm15
+pshufd $80, %xmm12, %xmm12
+/* choose the powers of r for this pass (flag bits 4/5) */
+poly1305_blocks_sse2_14:
+movq 120(%rdi), %rax
+testb $48, %al
+je poly1305_blocks_sse2_15
+testb $16, %al
+movd 56(%rdi), %xmm2
+leaq 40(%rdi), %rax
+je poly1305_blocks_sse2_16
+/* mixed powers: pair state+60 with state+40 */
+movdqu 60(%rdi), %xmm1
+movdqu (%rax), %xmm4
+movd %xmm2, %eax
+movd 76(%rdi), %xmm2
+movaps %xmm1, %xmm7
+movd %eax, %xmm5
+punpckldq %xmm4, %xmm7
+punpckhdq %xmm4, %xmm1
+punpcklqdq %xmm5, %xmm2
+jmp poly1305_blocks_sse2_17
+/* final lane: pair r with the constant 1 */
+poly1305_blocks_sse2_16:
+movdqu (%rax), %xmm1
+movl $1, %r8d
+movd %r8d, %xmm4
+movaps %xmm1, %xmm7
+punpckldq %xmm4, %xmm7
+punpckhdq %xmm4, %xmm1
+poly1305_blocks_sse2_17:
+pshufd $80, %xmm7, %xmm11
+pshufd $80, %xmm1, %xmm4
+pshufd $250, %xmm7, %xmm7
+movaps %xmm11, 168(%rsp)
+pshufd $250, %xmm1, %xmm1
+jmp poly1305_blocks_sse2_18
+/* steady state: broadcast the power at state+60 into both lanes */
+poly1305_blocks_sse2_15:
+movdqu 60(%rdi), %xmm1
+movd 76(%rdi), %xmm2
+pshufd $0, %xmm2, %xmm2
+pshufd $0, %xmm1, %xmm11
+pshufd $85, %xmm1, %xmm7
+pshufd $170, %xmm1, %xmm4
+movaps %xmm11, 168(%rsp)
+pshufd $255, %xmm1, %xmm1
+/* precompute 5*limb multiplier copies and spill to the stack frame */
+poly1305_blocks_sse2_18:
+movaps %xmm1, %xmm14
+movaps %xmm7, %xmm5
+movaps %xmm4, %xmm13
+movaps %xmm1, 264(%rsp)
+movaps %xmm2, %xmm1
+cmpq $63, %rdx
+movq $(5), %r8
+movd %r8, %xmm6
+pshufd $68, %xmm6, %xmm6
+pmuludq %xmm6, %xmm5
+movaps %xmm4, 296(%rsp)
+pmuludq %xmm6, %xmm13
+movaps %xmm2, 152(%rsp)
+pmuludq %xmm6, %xmm14
+pmuludq %xmm6, %xmm1
+movaps %xmm5, 88(%rsp)
+movaps %xmm13, 72(%rsp)
+movaps %xmm14, 56(%rsp)
+movaps %xmm1, 40(%rsp)
+jbe poly1305_blocks_sse2_19
+/* >= 64 bytes pending: set up the second power set and enter the loop */
+movdqu 80(%rdi), %xmm1
+movd 96(%rdi), %xmm2
+movq %rdx, %rcx
+pshufd $0, %xmm2, %xmm2
+movaps %xmm2, 24(%rsp)
+pmuludq %xmm6, %xmm2
+pshufd $85, %xmm1, %xmm4
+movaps %xmm4, 280(%rsp)
+pmuludq %xmm6, %xmm4
+pshufd $255, %xmm1, %xmm13
+pshufd $170, %xmm1, %xmm5
+movaps 72(%rsp), %xmm14
+movaps %xmm5, 216(%rsp)
+pmuludq %xmm6, %xmm5
+movq %rsi, %rax
+movaps %xmm4, -24(%rsp)
+movaps %xmm13, %xmm4
+pshufd $0, %xmm1, %xmm1
+pmuludq %xmm6, %xmm4
+movaps %xmm14, -8(%rsp)
+movaps %xmm5, 8(%rsp)
+movaps 168(%rsp), %xmm5
+movaps %xmm1, 248(%rsp)
+movaps 56(%rsp), %xmm1
+movaps %xmm4, 120(%rsp)
+movaps 40(%rsp), %xmm4
+movaps %xmm13, 136(%rsp)
+movaps %xmm2, 200(%rsp)
+movaps %xmm1, 104(%rsp)
+movaps %xmm4, 184(%rsp)
+movaps %xmm5, 232(%rsp)
+jmp poly1305_blocks_sse2_20
+.p2align 6
+/* main loop: multiply-accumulate, absorb 64 input bytes, lazy carry */
+poly1305_blocks_sse2_20:
+movaps -24(%rsp), %xmm5
+movaps %xmm8, %xmm13
+subq $64, %rcx
+movaps 8(%rsp), %xmm4
+movaps 120(%rsp), %xmm10
+pmuludq %xmm12, %xmm5
+pmuludq %xmm15, %xmm4
+movaps 8(%rsp), %xmm2
+pmuludq %xmm9, %xmm10
+movaps 120(%rsp), %xmm11
+movaps 200(%rsp), %xmm14
+pmuludq %xmm12, %xmm2
+paddq %xmm4, %xmm5
+pmuludq %xmm15, %xmm11
+movaps 120(%rsp), %xmm1
+paddq %xmm10, %xmm5
+pmuludq %xmm8, %xmm14
+movaps 200(%rsp), %xmm10
+movaps 200(%rsp), %xmm4
+pmuludq %xmm12, %xmm1
+movaps 248(%rsp), %xmm8
+pmuludq %xmm15, %xmm10
+paddq %xmm11, %xmm2
+pmuludq %xmm12, %xmm4
+paddq %xmm14, %xmm5
+movaps 200(%rsp), %xmm11
+movaps 248(%rsp), %xmm14
+pmuludq %xmm15, %xmm8
+pmuludq 248(%rsp), %xmm12
+pmuludq %xmm9, %xmm11
+paddq %xmm10, %xmm1
+movaps 248(%rsp), %xmm10
+pmuludq 280(%rsp), %xmm15
+pmuludq %xmm3, %xmm14
+paddq %xmm15, %xmm12
+paddq %xmm8, %xmm4
+pmuludq %xmm13, %xmm10
+movq 24(%rax), %xmm15
+movaps 248(%rsp), %xmm8
+paddq %xmm11, %xmm2
+movaps %xmm3, %xmm11
+movaps 280(%rsp), %xmm3
+paddq %xmm14, %xmm5
+pmuludq %xmm9, %xmm8
+paddq %xmm10, %xmm2
+movq 16(%rax), %xmm14
+movaps 280(%rsp), %xmm10
+pmuludq %xmm9, %xmm3
+pmuludq 216(%rsp), %xmm9
+paddq %xmm9, %xmm12
+paddq %xmm8, %xmm1
+movq (%rax), %xmm8
+pmuludq %xmm11, %xmm10
+paddq %xmm3, %xmm4
+movaps 216(%rsp), %xmm3
+punpcklqdq %xmm14, %xmm8
+movaps 280(%rsp), %xmm14
+pmuludq %xmm13, %xmm3
+paddq %xmm10, %xmm2
+movq 8(%rax), %xmm10
+pmuludq %xmm13, %xmm14
+pmuludq 136(%rsp), %xmm13
+paddq %xmm13, %xmm12
+punpcklqdq %xmm15, %xmm10
+movaps %xmm10, %xmm9
+movaps 216(%rsp), %xmm15
+paddq %xmm3, %xmm4
+psllq $12, %xmm9
+movaps %xmm0, %xmm3
+paddq %xmm14, %xmm1
+pmuludq %xmm11, %xmm15
+pand %xmm8, %xmm3
+movaps 136(%rsp), %xmm14
+movaps %xmm3, -40(%rsp)
+movaps %xmm8, %xmm3
+movdqu 48(%rax), %xmm13
+psrlq $52, %xmm8
+pmuludq %xmm11, %xmm14
+paddq %xmm15, %xmm1
+por %xmm9, %xmm8
+pmuludq 24(%rsp), %xmm11
+paddq %xmm11, %xmm12
+movdqu 32(%rax), %xmm11
+movaps %xmm10, %xmm9
+psrlq $40, %xmm10
+pand %xmm0, %xmm8
+movaps %xmm11, %xmm15
+paddq %xmm14, %xmm4
+xorps %xmm14, %xmm14
+punpckldq %xmm13, %xmm15
+psrlq $14, %xmm9
+addq $64, %rax
+pand %xmm0, %xmm9
+psrlq $26, %xmm3
+cmpq $63, %rcx
+por 312(%rsp), %xmm10
+movaps %xmm13, -72(%rsp)
+movaps %xmm15, %xmm13
+punpckldq %xmm14, %xmm13
+punpckhdq -72(%rsp), %xmm11
+movaps %xmm13, -56(%rsp)
+movaps %xmm11, %xmm13
+punpckhdq %xmm14, %xmm11
+pand %xmm0, %xmm3
+psllq $18, %xmm11
+punpckhdq %xmm14, %xmm15
+punpckldq %xmm14, %xmm13
+paddq %xmm11, %xmm4
+movaps -8(%rsp), %xmm11
+psllq $6, %xmm15
+psllq $12, %xmm13
+movaps 88(%rsp), %xmm14
+paddq %xmm15, %xmm2
+pmuludq %xmm10, %xmm11
+paddq %xmm13, %xmm1
+movaps -8(%rsp), %xmm13
+pmuludq %xmm10, %xmm14
+paddq -56(%rsp), %xmm5
+paddq 312(%rsp), %xmm12
+pmuludq %xmm9, %xmm13
+movaps 104(%rsp), %xmm15
+paddq %xmm11, %xmm2
+movaps 184(%rsp), %xmm11
+paddq %xmm14, %xmm5
+movaps 104(%rsp), %xmm14
+pmuludq %xmm9, %xmm15
+pmuludq %xmm10, %xmm11
+paddq %xmm13, %xmm5
+movaps 104(%rsp), %xmm13
+pmuludq %xmm10, %xmm14
+pmuludq 232(%rsp), %xmm10
+paddq %xmm10, %xmm12
+pmuludq %xmm8, %xmm13
+paddq %xmm15, %xmm2
+movaps %xmm8, %xmm10
+paddq %xmm11, %xmm4
+pmuludq %xmm7, %xmm10
+movaps 232(%rsp), %xmm11
+movaps 184(%rsp), %xmm15
+paddq %xmm14, %xmm1
+pmuludq %xmm9, %xmm11
+paddq %xmm13, %xmm5
+movaps 184(%rsp), %xmm13
+movaps 184(%rsp), %xmm14
+pmuludq %xmm3, %xmm15
+pmuludq %xmm9, %xmm13
+paddq %xmm11, %xmm4
+pmuludq %xmm8, %xmm14
+movaps 232(%rsp), %xmm11
+paddq %xmm10, %xmm4
+paddq %xmm15, %xmm5
+pmuludq %xmm7, %xmm9
+pmuludq %xmm8, %xmm11
+paddq %xmm13, %xmm1
+movaps 232(%rsp), %xmm13
+movaps 296(%rsp), %xmm10
+paddq %xmm14, %xmm2
+pmuludq 296(%rsp), %xmm8
+movaps -40(%rsp), %xmm14
+pmuludq %xmm3, %xmm13
+paddq %xmm9, %xmm12
+paddq %xmm11, %xmm1
+movaps %xmm3, %xmm11
+paddq %xmm8, %xmm12
+movaps 232(%rsp), %xmm15
+pmuludq %xmm7, %xmm11
+pmuludq %xmm3, %xmm10
+paddq %xmm13, %xmm2
+movaps %xmm14, %xmm13
+movaps 296(%rsp), %xmm9
+pmuludq %xmm14, %xmm15
+pmuludq 264(%rsp), %xmm3
+paddq %xmm11, %xmm1
+pmuludq %xmm7, %xmm13
+paddq %xmm3, %xmm12
+movaps 264(%rsp), %xmm11
+paddq %xmm10, %xmm4
+pmuludq %xmm14, %xmm9
+paddq %xmm15, %xmm5
+pmuludq %xmm14, %xmm11
+movaps %xmm5, %xmm8
+paddq %xmm13, %xmm2
+psrlq $26, %xmm8
+paddq %xmm9, %xmm1
+pand %xmm0, %xmm5
+pmuludq 152(%rsp), %xmm14
+paddq %xmm14, %xmm12
+/* carry chain across the five limb pairs; top carry wraps via *5 */
+paddq %xmm8, %xmm2
+paddq %xmm11, %xmm4
+movaps %xmm2, %xmm9
+movaps %xmm2, %xmm8
+movaps %xmm4, %xmm3
+psrlq $26, %xmm9
+pand %xmm0, %xmm4
+psrlq $26, %xmm3
+paddq %xmm9, %xmm1
+pand %xmm0, %xmm8
+paddq %xmm3, %xmm12
+movaps %xmm1, %xmm10
+movaps %xmm1, %xmm9
+movaps %xmm12, %xmm3
+psrlq $26, %xmm10
+pand %xmm0, %xmm12
+psrlq $26, %xmm3
+paddq %xmm10, %xmm4
+pand %xmm0, %xmm9
+pmuludq %xmm6, %xmm3
+movaps %xmm4, %xmm1
+movaps %xmm4, %xmm15
+psrlq $26, %xmm1
+pand %xmm0, %xmm15
+paddq %xmm1, %xmm12
+paddq %xmm3, %xmm5
+movaps %xmm5, %xmm2
+movaps %xmm5, %xmm3
+psrlq $26, %xmm2
+pand %xmm0, %xmm3
+paddq %xmm2, %xmm8
+ja poly1305_blocks_sse2_20
+leaq -64(%rdx), %rax
+andl $63, %edx
+andq $-64, %rax
+leaq 64(%rsi,%rax), %rsi
+/* tail: one last 32-byte double-block if pending */
+poly1305_blocks_sse2_19:
+cmpq $31, %rdx
+jbe poly1305_blocks_sse2_21
+movaps 56(%rsp), %xmm11
+movaps %xmm15, %xmm1
+movaps %xmm15, %xmm14
+movaps 72(%rsp), %xmm5
+movaps %xmm12, %xmm4
+movaps %xmm15, %xmm10
+movaps 88(%rsp), %xmm2
+pmuludq %xmm11, %xmm14
+movaps %xmm8, %xmm15
+pmuludq %xmm5, %xmm1
+movaps 40(%rsp), %xmm13
+testq %rsi, %rsi
+pmuludq %xmm12, %xmm2
+pmuludq %xmm12, %xmm5
+pmuludq %xmm11, %xmm4
+paddq %xmm1, %xmm2
+pmuludq %xmm9, %xmm11
+movaps %xmm12, %xmm1
+paddq %xmm14, %xmm5
+pmuludq %xmm13, %xmm15
+movaps %xmm9, %xmm14
+pmuludq %xmm13, %xmm14
+pmuludq %xmm13, %xmm1
+paddq %xmm11, %xmm2
+movaps 168(%rsp), %xmm11
+pmuludq %xmm10, %xmm13
+paddq %xmm15, %xmm2
+movaps %xmm9, %xmm15
+paddq %xmm14, %xmm5
+pmuludq %xmm11, %xmm12
+movaps %xmm3, %xmm14
+pmuludq %xmm11, %xmm14
+movaps %xmm13, 248(%rsp)
+movaps %xmm10, %xmm13
+pmuludq %xmm7, %xmm15
+paddq 248(%rsp), %xmm4
+pmuludq %xmm11, %xmm13
+pmuludq %xmm7, %xmm10
+paddq %xmm14, %xmm2
+movaps %xmm13, 280(%rsp)
+movaps %xmm8, %xmm13
+pmuludq %xmm11, %xmm13
+paddq %xmm10, %xmm12
+movaps 296(%rsp), %xmm10
+paddq 280(%rsp), %xmm1
+pmuludq %xmm9, %xmm11
+pmuludq 296(%rsp), %xmm9
+pmuludq %xmm3, %xmm10
+paddq %xmm9, %xmm12
+paddq %xmm13, %xmm5
+movaps %xmm3, %xmm13
+paddq %xmm15, %xmm1
+pmuludq %xmm7, %xmm13
+paddq %xmm11, %xmm4
+movaps 296(%rsp), %xmm11
+pmuludq %xmm8, %xmm7
+pmuludq %xmm8, %xmm11
+pmuludq 264(%rsp), %xmm8
+paddq %xmm8, %xmm12
+paddq %xmm13, %xmm5
+paddq %xmm7, %xmm4
+movaps 264(%rsp), %xmm7
+paddq %xmm11, %xmm1
+paddq %xmm10, %xmm4
+pmuludq %xmm3, %xmm7
+pmuludq 152(%rsp), %xmm3
+paddq %xmm3, %xmm12
+paddq %xmm7, %xmm1
+je poly1305_blocks_sse2_22
+/* absorb the final 32 input bytes (in != NULL) */
+movdqu (%rsi), %xmm7
+xorps %xmm3, %xmm3
+paddq 312(%rsp), %xmm12
+movdqu 16(%rsi), %xmm8
+movaps %xmm7, %xmm9
+punpckldq %xmm8, %xmm9
+punpckhdq %xmm8, %xmm7
+movaps %xmm9, %xmm10
+movaps %xmm7, %xmm8
+punpckldq %xmm3, %xmm10
+punpckhdq %xmm3, %xmm9
+punpckhdq %xmm3, %xmm7
+punpckldq %xmm3, %xmm8
+movaps %xmm8, %xmm3
+psllq $6, %xmm9
+paddq %xmm10, %xmm2
+psllq $12, %xmm3
+paddq %xmm9, %xmm5
+psllq $18, %xmm7
+paddq %xmm3, %xmm4
+paddq %xmm7, %xmm1
+/* carry/normalize the lanes again */
+poly1305_blocks_sse2_22:
+movaps %xmm2, %xmm8
+movaps %xmm1, %xmm3
+movaps %xmm1, %xmm15
+psrlq $26, %xmm8
+pand %xmm0, %xmm2
+pand %xmm0, %xmm15
+psrlq $26, %xmm3
+paddq %xmm5, %xmm8
+paddq %xmm12, %xmm3
+movaps %xmm8, %xmm9
+pand %xmm0, %xmm8
+movaps %xmm3, %xmm1
+psrlq $26, %xmm9
+movaps %xmm3, %xmm12
+psrlq $26, %xmm1
+paddq %xmm4, %xmm9
+pand %xmm0, %xmm12
+pmuludq %xmm1, %xmm6
+movaps %xmm9, %xmm3
+pand %xmm0, %xmm9
+psrlq $26, %xmm3
+paddq %xmm3, %xmm15
+paddq %xmm6, %xmm2
+movaps %xmm15, %xmm3
+pand %xmm0, %xmm15
+movaps %xmm2, %xmm1
+psrlq $26, %xmm3
+psrlq $26, %xmm1
+paddq %xmm3, %xmm12
+movaps %xmm0, %xmm3
+paddq %xmm1, %xmm8
+pand %xmm2, %xmm3
+/* in == NULL requests the final fold; otherwise save the lanes */
+poly1305_blocks_sse2_21:
+testq %rsi, %rsi
+je poly1305_blocks_sse2_23
+pshufd $8, %xmm3, %xmm3
+pshufd $8, %xmm8, %xmm8
+pshufd $8, %xmm9, %xmm9
+pshufd $8, %xmm15, %xmm15
+pshufd $8, %xmm12, %xmm12
+punpcklqdq %xmm8, %xmm3
+punpcklqdq %xmm15, %xmm9
+movdqu %xmm3, (%rdi)
+movdqu %xmm9, 16(%rdi)
+movq %xmm12, 32(%rdi)
+jmp poly1305_blocks_sse2_10
+/* fold the two lanes, fully reduce mod 2^130-5, store 44/44/42 limbs */
+poly1305_blocks_sse2_23:
+movaps %xmm3, %xmm0
+movaps %xmm8, %xmm4
+movaps %xmm9, %xmm2
+psrldq $8, %xmm0
+movaps %xmm15, %xmm10
+paddq %xmm0, %xmm3
+psrldq $8, %xmm4
+movaps %xmm12, %xmm0
+movd %xmm3, %edx
+paddq %xmm4, %xmm8
+psrldq $8, %xmm2
+movl %edx, %ecx
+movd %xmm8, %eax
+paddq %xmm2, %xmm9
+shrl $26, %ecx
+psrldq $8, %xmm10
+andl $67108863, %edx
+addl %ecx, %eax
+movd %xmm9, %ecx
+paddq %xmm10, %xmm15
+movl %eax, %r9d
+shrl $26, %eax
+psrldq $8, %xmm0
+addl %ecx, %eax
+movd %xmm15, %ecx
+paddq %xmm0, %xmm12
+movl %eax, %esi
+andl $67108863, %r9d
+movd %xmm12, %r10d
+shrl $26, %esi
+andl $67108863, %eax
+addl %ecx, %esi
+salq $8, %rax
+movl %r9d, %ecx
+shrl $18, %r9d
+movl %esi, %r8d
+shrl $26, %esi
+andl $67108863, %r8d
+addl %r10d, %esi
+orq %r9, %rax
+salq $16, %rsi
+movq %r8, %r9
+shrl $10, %r8d
+salq $26, %rcx
+orq %r8, %rsi
+salq $34, %r9
+orq %rdx, %rcx
+movq %rsi, %r11
+shrq $42, %rsi
+/* 0xfffffffffff = 44-bit mask, 0x3ffffffffff = 42-bit mask */
+movabsq $17592186044415, %rdx
+orq %r9, %rax
+movabsq $4398046511103, %r8
+andq %rdx, %rcx
+andq %rdx, %rax
+andq %r8, %r11
+leaq (%rsi,%rsi,4), %rsi
+addq %rsi, %rcx
+movq %rcx, %r10
+shrq $44, %rcx
+addq %rcx, %rax
+andq %rdx, %r10
+movq %rax, %r9
+shrq $44, %rax
+addq %r11, %rax
+andq %rdx, %r9
+/* -(2^42): used for the h + -p candidate */
+movabsq $-4398046511104, %r11
+movq %rax, %rcx
+andq %r8, %rcx
+shrq $42, %rax
+leaq (%rax,%rax,4), %rsi
+addq %rcx, %r11
+addq %r10, %rsi
+movq %rsi, %r8
+shrq $44, %rsi
+andq %rdx, %r8
+addq %r9, %rsi
+leaq 5(%r8), %r9
+movq %r9, %rbx
+andq %rdx, %r9
+shrq $44, %rbx
+addq %rsi, %rbx
+movq %rbx, %rax
+andq %rbx, %rdx
+shrq $44, %rax
+addq %rax, %r11
+/* branch-free select of h vs h-p, as in the C reference */
+movq %r11, %rax
+shrq $63, %rax
+decq %rax
+movq %rax, %r10
+andq %rax, %r9
+andq %rax, %rdx
+notq %r10
+andq %r11, %rax
+andq %r10, %r8
+andq %r10, %rsi
+andq %r10, %rcx
+orq %r9, %r8
+orq %rdx, %rsi
+orq %rax, %rcx
+movq %r8, (%rdi)
+movq %rsi, 8(%rdi)
+movq %rcx, 16(%rdi)
+poly1305_blocks_sse2_10:
+movq -8(%rbp), %rbx
+leave
+ret
+FN_END poly1305_blocks_sse2
+
+/*
+ * void poly1305_finish_ext_sse2(void *state %rdi, const uint8_t *in %rsi,
+ *                               size_t remaining %rdx, uint8_t mac[16] %rcx)
+ * Copies the trailing 0..31 bytes into a zeroed 32-byte stack buffer,
+ * appends the 0x01 pad byte (unless exactly one whole 16-byte block
+ * remains), sets the final-block flag bits at state+120 and absorbs the
+ * buffer; then runs one fold pass (in = NULL), adds the pad from
+ * state+104/112, stores the 16-byte tag and wipes the 128-byte state.
+ */
+GLOBAL_HIDDEN_FN poly1305_finish_ext_sse2
+poly1305_finish_ext_sse2_local:
+pushq %r12
+movq %rcx, %r12
+pushq %rbp
+movq %rdx, %rbp
+pushq %rbx
+movq %rdi, %rbx
+subq $32, %rsp
+testq %rdx, %rdx
+je poly1305_finish_ext_sse2_27
+/* zero the 32-byte buffer, then copy "remaining" bytes by length bits:
+   16 / 8 / 4 / 2 / 1 */
+xorl %eax, %eax
+movq %rsp, %rdi
+movl $8, %ecx
+rep stosl
+/* rsi = in - rsp, so (%rsp-relative base, rsi) addresses the source
+   while %rax walks the destination */
+subq %rsp, %rsi
+testb $16, %dl
+movq %rsp, %rax
+je poly1305_finish_ext_sse2_28
+movdqu (%rsp,%rsi), %xmm0
+addq $16, %rax
+movaps %xmm0, (%rsp)
+poly1305_finish_ext_sse2_28:
+testb $8, %bpl
+je poly1305_finish_ext_sse2_29
+movq (%rax,%rsi), %rdx
+movq %rdx, (%rax)
+addq $8, %rax
+poly1305_finish_ext_sse2_29:
+testb $4, %bpl
+je poly1305_finish_ext_sse2_30
+movl (%rax,%rsi), %edx
+movl %edx, (%rax)
+addq $4, %rax
+poly1305_finish_ext_sse2_30:
+testb $2, %bpl
+je poly1305_finish_ext_sse2_31
+movw (%rax,%rsi), %dx
+movw %dx, (%rax)
+addq $2, %rax
+poly1305_finish_ext_sse2_31:
+testb $1, %bpl
+je poly1305_finish_ext_sse2_32
+movb (%rax,%rsi), %dl
+movb %dl, (%rax)
+poly1305_finish_ext_sse2_32:
+/* append the 0x01 pad byte unless a full 16-byte block remains */
+cmpq $16, %rbp
+je poly1305_finish_ext_sse2_33
+movb $1, (%rsp,%rbp)
+poly1305_finish_ext_sse2_33:
+/* flags |= (remaining < 16) ? 8 : 4, then absorb the padded 32 bytes */
+cmpq $16, %rbp
+movl $32, %edx
+movq %rsp, %rsi
+sbbq %rax, %rax
+movq %rbx, %rdi
+andl $4, %eax
+addq $4, %rax
+orq %rax, 120(%rbx)
+call poly1305_blocks_sse2_local
+poly1305_finish_ext_sse2_27:
+/* if any block was absorbed (flag bit 0), run the final fold pass with
+   flag 32 (last chunk was 1..16 bytes) or 16, and in = NULL */
+movq 120(%rbx), %rax
+testb $1, %al
+je poly1305_finish_ext_sse2_35
+decq %rbp
+cmpq $15, %rbp
+jbe poly1305_finish_ext_sse2_36
+orq $16, %rax
+jmp poly1305_finish_ext_sse2_40
+poly1305_finish_ext_sse2_36:
+orq $32, %rax
+poly1305_finish_ext_sse2_40:
+movq %rax, 120(%rbx)
+movl $32, %edx
+xorl %esi, %esi
+movq %rbx, %rdi
+call poly1305_blocks_sse2_local
+poly1305_finish_ext_sse2_35:
+/* tag = (h mod 2^128) + pad, assembled from the 44/44/42 limbs at
+   state+0/8/16 into two 64-bit halves with carry */
+movq 8(%rbx), %rax
+movq 112(%rbx), %rsi
+movq %rax, %rdx
+movq %rax, %rcx
+movq 16(%rbx), %rax
+shrq $20, %rcx
+salq $44, %rdx
+orq (%rbx), %rdx
+salq $24, %rax
+orq %rcx, %rax
+movq 104(%rbx), %rcx
+addq %rcx, %rdx
+adcq %rsi, %rax
+/* wipe the full 128-byte state */
+xorps %xmm0, %xmm0
+movdqu %xmm0, (%rbx)
+movdqu %xmm0, 16(%rbx)
+movdqu %xmm0, 32(%rbx)
+movdqu %xmm0, 48(%rbx)
+movdqu %xmm0, 64(%rbx)
+movdqu %xmm0, 80(%rbx)
+movdqu %xmm0, 96(%rbx)
+movdqu %xmm0, 112(%rbx)
+movq %rdx, (%r12)
+movq %rax, 8(%r12)
+addq $32, %rsp
+popq %rbx
+popq %rbp
+popq %r12
+ret
+FN_END poly1305_finish_ext_sse2
+
+/*
+ * void poly1305_auth_sse2(uint8_t mac[16] %rdi, const uint8_t *in %rsi,
+ *                         size_t inlen %rdx, const poly1305_key *key %rcx)
+ * One-shot MAC. Messages shorter than 128 bytes are delegated to
+ * poly1305_auth_x86_local.
+ * NOTE(review): poly1305_auth_x86_local is not defined in this file —
+ * confirm it is provided by the x86 reference object linked alongside.
+ * Otherwise: build a 64-byte-aligned 128-byte state on the stack, init,
+ * absorb inlen rounded down to a multiple of 32 bytes, then finish with
+ * the tail.
+ */
+GLOBAL_HIDDEN_FN poly1305_auth_sse2
+cmpq $128, %rdx
+jb poly1305_auth_x86_local
+pushq %rbp
+movq %rsp, %rbp
+pushq %r14
+pushq %r13
+movq %rdi, %r13
+pushq %r12
+movq %rsi, %r12
+movq %rcx, %rsi
+pushq %rbx
+movq %rdx, %rbx
+andq $-64, %rsp
+movq %rbx, %r14
+addq $-128, %rsp
+movq %rsp, %rdi
+call poly1305_init_ext_sse2_local
+/* r14 = inlen & ~31: the whole-double-block prefix */
+andq $-32, %r14
+je poly1305_auth_sse2_42
+movq %r12, %rsi
+movq %r14, %rdx
+movq %rsp, %rdi
+call poly1305_blocks_sse2_local
+addq %r14, %r12
+subq %r14, %rbx
+poly1305_auth_sse2_42:
+movq %r13, %rcx
+movq %rbx, %rdx
+movq %r12, %rsi
+movq %rsp, %rdi
+call poly1305_finish_ext_sse2_local
+leaq -32(%rbp), %rsp
+popq %rbx
+popq %r12
+popq %r13
+popq %r14
+popq %rbp
+ret
+FN_END poly1305_auth_sse2
+
+
+
+
+