diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-07-08 17:41:31 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-07-08 17:41:31 +0100 |
commit | 9026da71bb262886a275a3e24b1db51ab3395240 (patch) | |
tree | 6050f1a73472b3251b9fd18db7e8c96a7cca276c /src/libcryptobox/poly1305 | |
parent | c271eb36656a4ff88a9c8c1d59934949260275a3 (diff) | |
download | rspamd-9026da71bb262886a275a3e24b1db51ab3395240.tar.gz rspamd-9026da71bb262886a275a3e24b1db51ab3395240.zip |
[Rework] Use libsodium instead of hand crafted crypto implementations
Diffstat (limited to 'src/libcryptobox/poly1305')
-rw-r--r-- | src/libcryptobox/poly1305/avx.S | 877 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/avx2.S | 1095 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/constants.S | 21 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305.c | 224 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305.h | 38 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/poly1305_internal.h | 19 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/ref-32.c | 238 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/ref-64.c | 213 | ||||
-rw-r--r-- | src/libcryptobox/poly1305/sse2.S | 969 |
9 files changed, 0 insertions, 3694 deletions
diff --git a/src/libcryptobox/poly1305/avx.S b/src/libcryptobox/poly1305/avx.S deleted file mode 100644 index bf7390888..000000000 --- a/src/libcryptobox/poly1305/avx.S +++ /dev/null @@ -1,877 +0,0 @@ -#include "../macro.S" -#include "constants.S" - -SECTION_TEXT - -GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx,0,0 -movl $32, %eax -ret -FN_END poly1305_block_size_avx - -GLOBAL_HIDDEN_FN_EXT poly1305_init_ext_avx,4,1 -poly1305_init_ext_avx_local: -pushq %r15 -pushq %r14 -pushq %r13 -pushq %r12 -pushq %rbp -pushq %rbx -movq %rdi, %rbp -testq %rdx, %rdx -movq $-1, %rax -cmovne %rdx, %rax -movq %rax, -16(%rsp) -vpxor %xmm0, %xmm0, %xmm0 -vmovdqu %xmm0, (%rdi) -vmovdqu %xmm0, 16(%rdi) -vmovdqu %xmm0, 32(%rdi) -movq (%rsi), %r9 -movq 8(%rsi), %r8 -movabsq $17575274610687, %r10 -andq %r9, %r10 -shrq $44, %r9 -movq %r8, %rax -salq $20, %rax -orq %rax, %r9 -movabsq $17592181915647, %rax -andq %rax, %r9 -shrq $24, %r8 -movabsq $68719475727, %rax -andq %rax, %r8 -leaq 40(%rdi), %r15 -movl %r10d, %eax -andl $67108863, %eax -movl %eax, 40(%rdi) -movl %r9d, %edx -sall $18, %edx -movq %r10, %rax -shrq $26, %rax -orl %edx, %eax -andl $67108863, %eax -movl %eax, 44(%rdi) -movq %r9, %rax -shrq $8, %rax -andl $67108863, %eax -movl %eax, 48(%rdi) -movq %r9, %rax -shrq $34, %rax -movl %r8d, %edx -sall $10, %edx -orl %edx, %eax -andl $67108863, %eax -movl %eax, 52(%rdi) -movq %r8, %rax -shrq $16, %rax -movl %eax, 56(%rdi) -movq 16(%rsi), %rax -movq %rax, 104(%rdi) -movq 24(%rsi), %rax -movq %rax, 112(%rdi) -movl $0, %ebx -.L7: -testq %rbx, %rbx -jne .L4 -leaq 60(%rbp), %r15 -cmpq $16, -16(%rsp) -ja .L6 -jmp .L5 -.L4: -cmpq $1, %rbx -jne .L6 -leaq 80(%rbp), %r15 -cmpq $95, -16(%rsp) -jbe .L5 -.L6: -leaq (%r8,%r8,4), %rsi -salq $2, %rsi -leaq (%r9,%r9), %rdi -movq %rdi, %rax -mulq %rsi -movq %rax, %r13 -movq %rdx, %r14 -movq %r10, %rax -mulq %r10 -addq %r13, %rax -adcq %r14, %rdx -movabsq $17592186044415, %rcx -movq %rax, -72(%rsp) -movq %rdx, -64(%rsp) -andq -72(%rsp), %rcx -leaq 
(%r10,%r10), %r11 -movq %r11, %rax -mulq %r9 -movq %rax, %r11 -movq %rdx, %r12 -movq %rsi, %rax -mulq %r8 -movq %rax, %r13 -movq %rdx, %r14 -addq %r11, %r13 -adcq %r12, %r14 -movq -72(%rsp), %rax -movq -64(%rsp), %rdx -shrdq $44, %rdx, %rax -movq %rax, -56(%rsp) -movq $0, -48(%rsp) -addq -56(%rsp), %r13 -adcq -48(%rsp), %r14 -movabsq $17592186044415, %rsi -andq %r13, %rsi -leaq (%r8,%r8), %rdi -movq %rdi, %rax -mulq %r10 -movq %rax, %r11 -movq %rdx, %r12 -movq %r9, %rax -mulq %r9 -addq %r11, %rax -adcq %r12, %rdx -shrdq $44, %r14, %r13 -movq %r13, -40(%rsp) -movq $0, -32(%rsp) -addq -40(%rsp), %rax -adcq -32(%rsp), %rdx -movabsq $4398046511103, %rdi -andq %rax, %rdi -shrdq $42, %rdx, %rax -leaq (%rax,%rax,4), %r8 -addq %rcx, %r8 -movabsq $17592186044415, %r10 -andq %r8, %r10 -shrq $44, %r8 -addq %rsi, %r8 -movabsq $17592186044415, %r9 -andq %r8, %r9 -shrq $44, %r8 -addq %rdi, %r8 -movl %r10d, %eax -andl $67108863, %eax -movl %eax, (%r15) -movl %r9d, %edx -sall $18, %edx -movq %r10, %rax -shrq $26, %rax -orl %edx, %eax -andl $67108863, %eax -movl %eax, 4(%r15) -movq %r9, %rax -shrq $8, %rax -andl $67108863, %eax -movl %eax, 8(%r15) -movl %r8d, %edx -sall $10, %edx -movq %r9, %rax -shrq $34, %rax -orl %edx, %eax -andl $67108863, %eax -movl %eax, 12(%r15) -movq %r8, %rax -shrq $16, %rax -movl %eax, 16(%r15) -addq $1, %rbx -cmpq $2, %rbx -jne .L7 -.L5: -movq $0, 120(%rbp) -popq %rbx -popq %rbp -popq %r12 -popq %r13 -popq %r14 -popq %r15 -ret -FN_END poly1305_init_ext_avx - - - -GLOBAL_HIDDEN_FN poly1305_blocks_avx -poly1305_blocks_avx_local: -pushq %rbp -movq %rsp, %rbp -pushq %rbx -andq $-64, %rsp -subq $200, %rsp -movl $(1 << 24), %eax -movl $((1 << 26) - 1), %r8d -movl $(5), %r9d -vmovd %eax, %xmm1 -vmovd %r8d, %xmm0 -vmovd %r9d, %xmm2 -vpshufd $68, %xmm1, %xmm1 -vpshufd $68, %xmm0, %xmm0 -vpshufd $68, %xmm2, %xmm2 -vmovdqa %xmm1, 152(%rsp) -vmovdqa %xmm2, 184(%rsp) -movq 120(%rdi), %rax -testb $4, %al -je .L12 -vpsrldq $8, %xmm1, %xmm1 -vmovdqa %xmm1, 152(%rsp) 
-.L12: -testb $8, %al -je .L13 -vpxor %xmm1, %xmm1, %xmm1 -vmovdqa %xmm1, 152(%rsp) -.L13: -testb $1, %al -jne .L14 -vmovq (%rsi), %xmm1 -vpinsrq $1, 16(%rsi), %xmm1, %xmm1 -vmovq 8(%rsi), %xmm3 -vpinsrq $1, 24(%rsi), %xmm3, %xmm2 -vpand %xmm0, %xmm1, %xmm7 -vpsrlq $26, %xmm1, %xmm12 -vpand %xmm0, %xmm12, %xmm12 -vpsllq $12, %xmm2, %xmm3 -vpsrlq $52, %xmm1, %xmm1 -vpor %xmm3, %xmm1, %xmm6 -vpand %xmm0, %xmm6, %xmm3 -vpsrlq $26, %xmm6, %xmm6 -vpand %xmm0, %xmm6, %xmm6 -vpsrlq $40, %xmm2, %xmm2 -vpor 152(%rsp), %xmm2, %xmm2 -addq $32, %rsi -subq $32, %rdx -orq $1, %rax -movq %rax, 120(%rdi) -jmp .L15 -.L14: -vmovdqu (%rdi), %xmm12 -vmovdqu 16(%rdi), %xmm6 -vmovdqu 32(%rdi), %xmm2 -vpshufd $80, %xmm12, %xmm7 -vpshufd $250, %xmm12, %xmm12 -vpshufd $80, %xmm6, %xmm3 -vpshufd $250, %xmm6, %xmm6 -vpshufd $80, %xmm2, %xmm2 -.L15: -movq 120(%rdi), %rax -testb $48, %al -je .L16 -testb $16, %al -je .L17 -vmovdqu 40(%rdi), %xmm1 -vmovd 56(%rdi), %xmm4 -vmovdqu 60(%rdi), %xmm5 -vpunpckldq %xmm1, %xmm5, %xmm11 -vpunpckhdq %xmm1, %xmm5, %xmm5 -vmovd 76(%rdi), %xmm1 -vpunpcklqdq %xmm4, %xmm1, %xmm4 -jmp .L18 -.L17: -movl $(1), %r8d -vmovdqu 40(%rdi), %xmm5 -vmovd 56(%rdi), %xmm4 -vmovd %r8d, %xmm1 -vpunpckldq %xmm1, %xmm5, %xmm11 -vpunpckhdq %xmm1, %xmm5, %xmm5 -.L18: -vpshufd $80, %xmm11, %xmm1 -vpshufd $250, %xmm11, %xmm11 -vpshufd $80, %xmm5, %xmm10 -vpshufd $250, %xmm5, %xmm5 -jmp .L19 -.L16: -vmovdqu 60(%rdi), %xmm5 -vpshufd $0, %xmm5, %xmm1 -vpshufd $85, %xmm5, %xmm11 -vpshufd $170, %xmm5, %xmm10 -vpshufd $255, %xmm5, %xmm5 -vmovd 76(%rdi), %xmm4 -vpshufd $0, %xmm4, %xmm4 -.L19: -vmovdqa %xmm11, 136(%rsp) -vpmuludq 184(%rsp), %xmm11, %xmm13 -vmovdqa %xmm13, 120(%rsp) -vmovdqa %xmm10, 104(%rsp) -vpmuludq 184(%rsp), %xmm10, %xmm13 -vmovdqa %xmm13, 88(%rsp) -vmovdqa %xmm5, 72(%rsp) -vpmuludq 184(%rsp), %xmm5, %xmm5 -vmovdqa %xmm5, 56(%rsp) -vmovdqa %xmm4, 40(%rsp) -vpmuludq 184(%rsp), %xmm4, %xmm4 -vmovdqa %xmm4, 24(%rsp) -cmpq $63, %rdx -jbe .L20 -vmovdqu 80(%rdi), %xmm4 
-vpshufd $0, %xmm4, %xmm5 -vmovdqa %xmm5, 8(%rsp) -vpshufd $85, %xmm4, %xmm5 -vmovdqa %xmm5, -8(%rsp) -vpshufd $170, %xmm4, %xmm13 -vmovdqa %xmm13, -24(%rsp) -vpshufd $255, %xmm4, %xmm4 -vmovdqa %xmm4, %xmm10 -vmovdqa %xmm4, -40(%rsp) -vmovd 96(%rdi), %xmm4 -vpshufd $0, %xmm4, %xmm4 -vmovdqa %xmm4, %xmm8 -vmovdqa %xmm4, -56(%rsp) -vpmuludq 184(%rsp), %xmm5, %xmm4 -vmovdqa %xmm4, -72(%rsp) -vpmuludq 184(%rsp), %xmm13, %xmm4 -vmovdqa %xmm4, -88(%rsp) -vpmuludq 184(%rsp), %xmm10, %xmm4 -vmovdqa %xmm4, -104(%rsp) -vpmuludq 184(%rsp), %xmm8, %xmm4 -vmovdqa %xmm4, -120(%rsp) -leaq 32(%rsi), %rax -movq %rdx, %rcx -vmovdqa %xmm1, 168(%rsp) -jmp .L22 -.p2align 6 -nop -nop -nop -nop -.L22: -vpmuludq -72(%rsp), %xmm2, %xmm13 -vmovdqa -88(%rsp), %xmm5 -vpmuludq %xmm5, %xmm6, %xmm4 -vpmuludq %xmm5, %xmm2, %xmm11 -vmovdqa -104(%rsp), %xmm9 -vpmuludq %xmm9, %xmm6, %xmm5 -vpmuludq %xmm9, %xmm2, %xmm10 -vpaddq %xmm4, %xmm13, %xmm13 -vpmuludq %xmm9, %xmm3, %xmm4 -vmovdqa -120(%rsp), %xmm8 -vpmuludq %xmm8, %xmm2, %xmm9 -vpaddq %xmm5, %xmm11, %xmm11 -vmovdqa %xmm8, %xmm5 -vpmuludq %xmm8, %xmm12, %xmm8 -vpmuludq %xmm5, %xmm3, %xmm14 -vpaddq %xmm4, %xmm13, %xmm13 -vpmuludq %xmm5, %xmm6, %xmm4 -vmovdqa 8(%rsp), %xmm15 -vpmuludq %xmm15, %xmm6, %xmm5 -vpaddq %xmm8, %xmm13, %xmm13 -vpmuludq %xmm15, %xmm2, %xmm8 -vpaddq %xmm14, %xmm11, %xmm11 -vpmuludq %xmm15, %xmm7, %xmm14 -vpaddq %xmm4, %xmm10, %xmm10 -vpmuludq %xmm15, %xmm12, %xmm4 -vpaddq %xmm5, %xmm9, %xmm9 -vpmuludq %xmm15, %xmm3, %xmm5 -vmovdqa -8(%rsp), %xmm15 -vpmuludq %xmm15, %xmm3, %xmm2 -vpaddq %xmm14, %xmm13, %xmm13 -vpmuludq %xmm15, %xmm6, %xmm6 -vpaddq %xmm4, %xmm11, %xmm11 -vpmuludq %xmm15, %xmm7, %xmm4 -vpaddq %xmm5, %xmm10, %xmm10 -vmovq -32(%rax), %xmm5 -vpinsrq $1, -16(%rax), %xmm5, %xmm5 -vpmuludq %xmm15, %xmm12, %xmm14 -vpaddq %xmm2, %xmm9, %xmm9 -vmovdqa -24(%rsp), %xmm2 -vpmuludq %xmm2, %xmm12, %xmm15 -vpaddq %xmm6, %xmm8, %xmm8 -vpmuludq %xmm2, %xmm3, %xmm3 -vpaddq %xmm4, %xmm11, %xmm11 -vmovq -24(%rax), %xmm4 
-vpinsrq $1, -8(%rax), %xmm4, %xmm6 -vpmuludq %xmm2, %xmm7, %xmm4 -vpaddq %xmm14, %xmm10, %xmm10 -vmovdqa -40(%rsp), %xmm1 -vpmuludq %xmm1, %xmm7, %xmm14 -vpaddq %xmm15, %xmm9, %xmm9 -vpand %xmm5, %xmm0, %xmm2 -vpmuludq %xmm1, %xmm12, %xmm12 -vpaddq %xmm3, %xmm8, %xmm8 -vpsrlq $26, %xmm5, %xmm3 -vpand %xmm3, %xmm0, %xmm3 -vpmuludq -56(%rsp), %xmm7, %xmm7 -vpaddq %xmm4, %xmm10, %xmm10 -vpsllq $12, %xmm6, %xmm15 -vpsrlq $52, %xmm5, %xmm4 -vpor %xmm15, %xmm4, %xmm4 -vpaddq %xmm14, %xmm9, %xmm9 -vpsrlq $14, %xmm6, %xmm5 -vpand %xmm5, %xmm0, %xmm5 -vpaddq %xmm12, %xmm8, %xmm8 -vpand %xmm4, %xmm0, %xmm4 -vpaddq %xmm7, %xmm8, %xmm8 -vpsrlq $40, %xmm6, %xmm6 -vpor 152(%rsp), %xmm6, %xmm6 -vmovdqu (%rax), %xmm12 -vmovdqu 16(%rax), %xmm7 -vpunpckldq %xmm7, %xmm12, %xmm15 -vpunpckhdq %xmm7, %xmm12, %xmm7 -vpxor %xmm14, %xmm14, %xmm14 -vpunpckldq %xmm14, %xmm15, %xmm12 -vpunpckhdq %xmm14, %xmm15, %xmm15 -vpunpckldq %xmm14, %xmm7, %xmm14 -vpxor %xmm1, %xmm1, %xmm1 -vpunpckhdq %xmm1, %xmm7, %xmm7 -vpsllq $6, %xmm15, %xmm15 -vpsllq $12, %xmm14, %xmm14 -vpsllq $18, %xmm7, %xmm7 -vpaddq %xmm12, %xmm13, %xmm12 -vpaddq %xmm15, %xmm11, %xmm15 -vpaddq %xmm14, %xmm10, %xmm14 -vpaddq %xmm7, %xmm9, %xmm7 -vpaddq 152(%rsp), %xmm8, %xmm8 -vpmuludq 120(%rsp), %xmm6, %xmm13 -vmovdqa 88(%rsp), %xmm10 -vpmuludq %xmm10, %xmm5, %xmm9 -vpmuludq %xmm10, %xmm6, %xmm11 -vmovdqa 56(%rsp), %xmm1 -vpmuludq %xmm1, %xmm5, %xmm10 -vpaddq %xmm13, %xmm12, %xmm12 -vpmuludq %xmm1, %xmm6, %xmm13 -vpaddq %xmm9, %xmm12, %xmm12 -vpmuludq %xmm1, %xmm4, %xmm9 -vpaddq %xmm11, %xmm15, %xmm15 -vmovdqa 24(%rsp), %xmm1 -vpmuludq %xmm1, %xmm6, %xmm11 -vpaddq %xmm10, %xmm15, %xmm10 -vpmuludq %xmm1, %xmm3, %xmm15 -vpaddq %xmm13, %xmm14, %xmm14 -vpmuludq %xmm1, %xmm4, %xmm13 -vpaddq %xmm9, %xmm12, %xmm9 -vpmuludq %xmm1, %xmm5, %xmm12 -vpaddq %xmm11, %xmm7, %xmm7 -vpmuludq 168(%rsp), %xmm5, %xmm11 -vpaddq %xmm15, %xmm9, %xmm9 -vpmuludq 168(%rsp), %xmm6, %xmm6 -vpaddq %xmm13, %xmm10, %xmm10 -vpmuludq 168(%rsp), %xmm2, %xmm15 
-vpaddq %xmm12, %xmm14, %xmm14 -vpmuludq 168(%rsp), %xmm3, %xmm13 -vpaddq %xmm11, %xmm7, %xmm11 -vpmuludq 168(%rsp), %xmm4, %xmm12 -vpaddq %xmm6, %xmm8, %xmm6 -vmovdqa 136(%rsp), %xmm8 -vpmuludq %xmm8, %xmm4, %xmm7 -vpaddq %xmm15, %xmm9, %xmm9 -vpmuludq %xmm8, %xmm5, %xmm5 -vpaddq %xmm13, %xmm10, %xmm10 -vpmuludq %xmm8, %xmm2, %xmm15 -vpaddq %xmm12, %xmm14, %xmm14 -vpmuludq %xmm8, %xmm3, %xmm8 -vpaddq %xmm7, %xmm11, %xmm11 -vmovdqa 104(%rsp), %xmm7 -vpmuludq %xmm7, %xmm3, %xmm13 -vpaddq %xmm5, %xmm6, %xmm6 -vpmuludq %xmm7, %xmm4, %xmm4 -vpaddq %xmm15, %xmm10, %xmm10 -vpmuludq %xmm7, %xmm2, %xmm15 -vpaddq %xmm8, %xmm14, %xmm14 -vmovdqa 72(%rsp), %xmm5 -vpmuludq %xmm5, %xmm2, %xmm7 -vpaddq %xmm13, %xmm11, %xmm11 -vpmuludq %xmm5, %xmm3, %xmm3 -vpaddq %xmm4, %xmm6, %xmm6 -vpmuludq 40(%rsp), %xmm2, %xmm2 -vpaddq %xmm15, %xmm14, %xmm14 -vpaddq %xmm7, %xmm11, %xmm11 -vpaddq %xmm3, %xmm6, %xmm6 -vpaddq %xmm2, %xmm6, %xmm2 -vpsrlq $26, %xmm9, %xmm12 -vpsrlq $26, %xmm11, %xmm5 -vpand %xmm0, %xmm9, %xmm9 -vpand %xmm0, %xmm11, %xmm11 -vpaddq %xmm12, %xmm10, %xmm10 -vpaddq %xmm5, %xmm2, %xmm2 -vpsrlq $26, %xmm10, %xmm3 -vpsrlq $26, %xmm2, %xmm7 -vpand %xmm0, %xmm10, %xmm10 -vpand %xmm0, %xmm2, %xmm2 -vpaddq %xmm3, %xmm14, %xmm3 -vpmuludq 184(%rsp), %xmm7, %xmm7 -vpaddq %xmm7, %xmm9, %xmm9 -vpsrlq $26, %xmm3, %xmm6 -vpsrlq $26, %xmm9, %xmm12 -vpand %xmm0, %xmm3, %xmm3 -vpand %xmm0, %xmm9, %xmm7 -vpaddq %xmm6, %xmm11, %xmm6 -vpaddq %xmm12, %xmm10, %xmm12 -vpsrlq $26, %xmm6, %xmm8 -vpand %xmm0, %xmm6, %xmm6 -vpaddq %xmm8, %xmm2, %xmm2 -subq $64, %rcx -addq $64, %rax -cmpq $63, %rcx -ja .L22 -vmovdqa 168(%rsp), %xmm1 -leaq -64(%rdx), %rax -andq $-64, %rax -leaq 64(%rsi,%rax), %rsi -andl $63, %edx -.L20: -cmpq $31, %rdx -jbe .L23 -vpmuludq 120(%rsp), %xmm2, %xmm11 -vmovdqa 88(%rsp), %xmm4 -vpmuludq %xmm4, %xmm6, %xmm0 -vpmuludq %xmm4, %xmm2, %xmm10 -vmovdqa 56(%rsp), %xmm4 -vpmuludq %xmm4, %xmm6, %xmm8 -vpmuludq %xmm4, %xmm2, %xmm5 -vpaddq %xmm0, %xmm11, %xmm11 -vpmuludq %xmm4, 
%xmm3, %xmm0 -vmovdqa 24(%rsp), %xmm13 -vpmuludq %xmm13, %xmm2, %xmm4 -vpaddq %xmm8, %xmm10, %xmm10 -vpmuludq %xmm13, %xmm12, %xmm8 -vpmuludq %xmm13, %xmm3, %xmm9 -vpaddq %xmm0, %xmm11, %xmm11 -vpmuludq %xmm13, %xmm6, %xmm13 -vpmuludq %xmm1, %xmm6, %xmm0 -vpaddq %xmm8, %xmm11, %xmm8 -vpmuludq %xmm1, %xmm2, %xmm2 -vpaddq %xmm9, %xmm10, %xmm9 -vpmuludq %xmm1, %xmm7, %xmm11 -vpaddq %xmm13, %xmm5, %xmm5 -vpmuludq %xmm1, %xmm12, %xmm10 -vpaddq %xmm0, %xmm4, %xmm0 -vpmuludq %xmm1, %xmm3, %xmm1 -vmovdqa 136(%rsp), %xmm4 -vpmuludq %xmm4, %xmm3, %xmm14 -vpaddq %xmm11, %xmm8, %xmm11 -vpmuludq %xmm4, %xmm6, %xmm6 -vpaddq %xmm10, %xmm9, %xmm9 -vpmuludq %xmm4, %xmm7, %xmm15 -vpaddq %xmm1, %xmm5, %xmm5 -vpmuludq %xmm4, %xmm12, %xmm1 -vpaddq %xmm14, %xmm0, %xmm0 -vmovdqa 104(%rsp), %xmm4 -vpmuludq %xmm4, %xmm12, %xmm8 -vpaddq %xmm6, %xmm2, %xmm2 -vpmuludq %xmm4, %xmm3, %xmm3 -vpaddq %xmm15, %xmm9, %xmm9 -vpmuludq %xmm4, %xmm7, %xmm10 -vpaddq %xmm1, %xmm5, %xmm1 -vmovdqa 72(%rsp), %xmm4 -vpmuludq %xmm4, %xmm7, %xmm15 -vpaddq %xmm8, %xmm0, %xmm0 -vpmuludq %xmm4, %xmm12, %xmm12 -vpaddq %xmm3, %xmm2, %xmm2 -vpmuludq 40(%rsp), %xmm7, %xmm7 -vpaddq %xmm10, %xmm1, %xmm1 -vpaddq %xmm15, %xmm0, %xmm0 -vpaddq %xmm12, %xmm2, %xmm2 -vpaddq %xmm7, %xmm2, %xmm2 -movl $((1 << 26) - 1), %r8d -testq %rsi, %rsi -vmovd %r8d, %xmm15 -je .L24 -vmovdqu (%rsi), %xmm4 -vmovdqu 16(%rsi), %xmm3 -vpunpckldq %xmm3, %xmm4, %xmm5 -vpunpckhdq %xmm3, %xmm4, %xmm3 -vpxor %xmm4, %xmm4, %xmm4 -vpunpckldq %xmm4, %xmm5, %xmm7 -vpunpckhdq %xmm4, %xmm5, %xmm5 -vpunpckldq %xmm4, %xmm3, %xmm6 -vpunpckhdq %xmm4, %xmm3, %xmm3 -vpsllq $6, %xmm5, %xmm5 -vpsllq $12, %xmm6, %xmm6 -vpsllq $18, %xmm3, %xmm3 -vpaddq %xmm7, %xmm11, %xmm11 -vpaddq %xmm5, %xmm9, %xmm9 -vpaddq %xmm6, %xmm1, %xmm1 -vpaddq %xmm3, %xmm0, %xmm0 -vpaddq 152(%rsp), %xmm2, %xmm2 -.L24: -vpshufd $68, %xmm15, %xmm15 -vpsrlq $26, %xmm11, %xmm12 -vpsrlq $26, %xmm0, %xmm3 -vpand %xmm15, %xmm11, %xmm11 -vpand %xmm15, %xmm0, %xmm6 -vpaddq %xmm12, %xmm9, %xmm9 
-vpaddq %xmm3, %xmm2, %xmm2 -vpsrlq $26, %xmm9, %xmm3 -vpsrlq $26, %xmm2, %xmm7 -vpand %xmm15, %xmm9, %xmm9 -vpand %xmm15, %xmm2, %xmm2 -vpaddq %xmm3, %xmm1, %xmm3 -vpmuludq 184(%rsp), %xmm7, %xmm7 -vpaddq %xmm7, %xmm11, %xmm7 -vpsrlq $26, %xmm3, %xmm4 -vpsrlq $26, %xmm7, %xmm1 -vpand %xmm15, %xmm3, %xmm3 -vpand %xmm15, %xmm7, %xmm7 -vpaddq %xmm4, %xmm6, %xmm6 -vpaddq %xmm1, %xmm9, %xmm12 -vpsrlq $26, %xmm6, %xmm0 -vpand %xmm15, %xmm6, %xmm6 -vpaddq %xmm0, %xmm2, %xmm2 -.L23: -testq %rsi, %rsi -je .L25 -vpshufd $8, %xmm7, %xmm7 -vpshufd $8, %xmm12, %xmm12 -vpshufd $8, %xmm3, %xmm3 -vpshufd $8, %xmm6, %xmm6 -vpshufd $8, %xmm2, %xmm2 -vpunpcklqdq %xmm12, %xmm7, %xmm7 -vpunpcklqdq %xmm6, %xmm3, %xmm3 -vmovdqu %xmm7, (%rdi) -vmovdqu %xmm3, 16(%rdi) -vmovq %xmm2, 32(%rdi) -jmp .L11 -.L25: -vpsrldq $8, %xmm7, %xmm0 -vpaddq %xmm0, %xmm7, %xmm7 -vpsrldq $8, %xmm12, %xmm0 -vpaddq %xmm0, %xmm12, %xmm12 -vpsrldq $8, %xmm3, %xmm0 -vpaddq %xmm0, %xmm3, %xmm3 -vpsrldq $8, %xmm6, %xmm0 -vpaddq %xmm0, %xmm6, %xmm6 -vpsrldq $8, %xmm2, %xmm0 -vpaddq %xmm0, %xmm2, %xmm2 -vmovd %xmm7, %eax -vmovd %xmm12, %edx -movl %eax, %r9d -shrl $26, %r9d -addl %edx, %r9d -movl %r9d, %r8d -andl $67108863, %r8d -vmovd %xmm3, %edx -shrl $26, %r9d -addl %edx, %r9d -vmovd %xmm6, %edx -movl %r9d, %ecx -shrl $26, %ecx -addl %edx, %ecx -movl %ecx, %esi -andl $67108863, %esi -vmovd %xmm2, %r10d -movl %r8d, %r11d -salq $26, %r11 -andl $67108863, %eax -orq %rax, %r11 -movabsq $17592186044415, %rax -andq %rax, %r11 -andl $67108863, %r9d -salq $8, %r9 -shrl $18, %r8d -movl %r8d, %r8d -orq %r8, %r9 -movq %rsi, %rdx -salq $34, %rdx -orq %rdx, %r9 -andq %rax, %r9 -shrl $26, %ecx -addl %r10d, %ecx -salq $16, %rcx -shrl $10, %esi -movl %esi, %esi -orq %rsi, %rcx -movabsq $4398046511103, %r10 -movq %rcx, %r8 -andq %r10, %r8 -shrq $42, %rcx -leaq (%rcx,%rcx,4), %rdx -addq %r11, %rdx -movq %rdx, %rsi -andq %rax, %rsi -shrq $44, %rdx -addq %r9, %rdx -movq %rdx, %rcx -andq %rax, %rcx -shrq $44, %rdx -addq %r8, %rdx 
-andq %rdx, %r10 -shrq $42, %rdx -leaq (%rsi,%rdx,4), %rsi -leaq (%rsi,%rdx), %r11 -movq %r11, %rbx -andq %rax, %rbx -shrq $44, %r11 -addq %rcx, %r11 -leaq 5(%rbx), %r9 -movq %r9, %r8 -shrq $44, %r8 -addq %r11, %r8 -movabsq $-4398046511104, %rsi -addq %r10, %rsi -movq %r8, %rdx -shrq $44, %rdx -addq %rdx, %rsi -movq %rsi, %rdx -shrq $63, %rdx -subq $1, %rdx -movq %rdx, %rcx -notq %rcx -andq %rcx, %rbx -andq %rcx, %r11 -andq %r10, %rcx -andq %rax, %r9 -andq %rdx, %r9 -orq %r9, %rbx -movq %rbx, (%rdi) -andq %r8, %rax -andq %rdx, %rax -orq %rax, %r11 -movq %r11, 8(%rdi) -andq %rsi, %rdx -orq %rcx, %rdx -movq %rdx, 16(%rdi) -.L11: -movq -8(%rbp), %rbx -leave -ret -FN_END poly1305_blocks_avx - -GLOBAL_HIDDEN_FN poly1305_finish_ext_avx -poly1305_finish_ext_avx_local: -pushq %r12 -pushq %rbp -pushq %rbx -subq $32, %rsp -movq %rdi, %rbx -movq %rdx, %rbp -movq %rcx, %r12 -testq %rdx, %rdx -je .L30 -movq $0, (%rsp) -movq $0, 8(%rsp) -movq $0, 16(%rsp) -movq $0, 24(%rsp) -movq %rsp, %rax -subq %rsp, %rsi -testb $16, %dl -je .L31 -vmovdqu (%rsp,%rsi), %xmm0 -vmovdqa %xmm0, (%rsp) -addq $16, %rax -.L31: -testb $8, %bpl -je .L32 -movq (%rax,%rsi), %rdx -movq %rdx, (%rax) -addq $8, %rax -.L32: -testb $4, %bpl -je .L33 -movl (%rax,%rsi), %edx -movl %edx, (%rax) -addq $4, %rax -.L33: -testb $2, %bpl -je .L34 -movzwl (%rax,%rsi), %edx -movw %dx, (%rax) -addq $2, %rax -.L34: -testb $1, %bpl -je .L35 -movzbl (%rax,%rsi), %edx -movb %dl, (%rax) -.L35: -cmpq $16, %rbp -je .L36 -movb $1, (%rsp,%rbp) -movq 120(%rbx), %rdx -cmpq $16, %rbp -sbbq %rax, %rax -andl $4, %eax -addq $4, %rax -.L37: -orq %rdx, %rax -movq %rax, 120(%rbx) -movq %rsp, %rsi -movl $32, %edx -movq %rbx, %rdi -call poly1305_blocks_avx_local -.L30: -movq 120(%rbx), %rax -testb $1, %al -je .L38 -subq $1, %rbp -cmpq $15, %rbp -jbe .L39 -orq $16, %rax -movq %rax, 120(%rbx) -jmp .L40 -.L39: -orq $32, %rax -movq %rax, 120(%rbx) -.L40: -movl $32, %edx -movl $0, %esi -movq %rbx, %rdi -call poly1305_blocks_avx_local -.L38: -movq 
8(%rbx), %rax -movq %rax, %rdx -salq $44, %rdx -orq (%rbx), %rdx -shrq $20, %rax -movq 16(%rbx), %rcx -salq $24, %rcx -orq %rcx, %rax -movq 104(%rbx), %rcx -movq 112(%rbx), %rsi -addq %rcx, %rdx -adcq %rsi, %rax -vpxor %xmm0, %xmm0, %xmm0 -vmovdqu %xmm0, (%rbx) -vmovdqu %xmm0, 16(%rbx) -vmovdqu %xmm0, 32(%rbx) -vmovdqu %xmm0, 48(%rbx) -vmovdqu %xmm0, 64(%rbx) -vmovdqu %xmm0, 80(%rbx) -vmovdqu %xmm0, 96(%rbx) -vmovdqu %xmm0, 112(%rbx) -movq %rdx, (%r12) -movq %rax, 8(%r12) -jmp .L43 -.L36: -movq 120(%rbx), %rdx -movl $4, %eax -jmp .L37 -.L43: -addq $32, %rsp -popq %rbx -popq %rbp -popq %r12 -ret -FN_END poly1305_finish_ext_avx - -GLOBAL_HIDDEN_FN poly1305_auth_avx -/* -cmp $128, %rdx -jb poly1305_auth_x86_local -*/ -pushq %rbp -movq %rsp, %rbp -pushq %r14 -pushq %r13 -pushq %r12 -pushq %rbx -andq $-64, %rsp -addq $-128, %rsp -movq %rdi, %r14 -movq %rsi, %r12 -movq %rdx, %rbx -movq %rsp, %rdi -movq %rcx, %rsi -call poly1305_init_ext_avx_local -movq %rbx, %r13 -andq $-32, %r13 -je .L46 -movq %rsp, %rdi -movq %r13, %rdx -movq %r12, %rsi -call poly1305_blocks_avx_local -addq %r13, %r12 -subq %r13, %rbx -.L46: -movq %rsp, %rdi -movq %r14, %rcx -movq %rbx, %rdx -movq %r12, %rsi -call poly1305_finish_ext_avx_local -leaq -32(%rbp), %rsp -popq %rbx -popq %r12 -popq %r13 -popq %r14 -popq %rbp -ret -FN_END poly1305_auth_avx diff --git a/src/libcryptobox/poly1305/avx2.S b/src/libcryptobox/poly1305/avx2.S deleted file mode 100644 index 5aa5851d6..000000000 --- a/src/libcryptobox/poly1305/avx2.S +++ /dev/null @@ -1,1095 +0,0 @@ -#include "../macro.S" -#include "constants.S" -SECTION_TEXT - -GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx2, 0, 0 -movl $64, %eax -ret -FN_END poly1305_block_size_avx2 - -GLOBAL_HIDDEN_FN poly1305_auth_avx2 -/* -cmp $128, %rdx -jb poly1305_auth_x86_local -*/ -pushq %rbp -movq %rsp, %rbp -andq $-64, %rsp -pushq %r12 -pushq %r14 -pushq %r15 -pushq %rbx -subq $224, %rsp -movq %rsi, %r14 -movq %rdi, %rbx -lea (%rsp), %rdi -movq %rcx, %rsi -movq %rdx, %r12 
-call poly1305_init_ext_avx2_local -poly1305_auth_avx2_2: -movq %r12, %r15 -andq $-64, %r15 -je poly1305_auth_avx2_5 -poly1305_auth_avx2_3: -movq %r14, %rsi -lea (%rsp), %rdi -movq %r15, %rdx -call poly1305_blocks_avx2_local -poly1305_auth_avx2_4: -addq %r15, %r14 -subq %r15, %r12 -poly1305_auth_avx2_5: -movq %r14, %rsi -lea (%rsp), %rdi -movq %r12, %rdx -movq %rbx, %rcx -call poly1305_finish_ext_avx2_local -poly1305_auth_avx2_6: -addq $224, %rsp -popq %rbx -popq %r15 -popq %r14 -popq %r12 -movq %rbp, %rsp -popq %rbp -ret -FN_END poly1305_auth_avx2 - - -GLOBAL_HIDDEN_FN poly1305_finish_ext_avx2 -poly1305_finish_ext_avx2_local: -pushq %rbp -movq %rsp, %rbp -andq $-64, %rsp -pushq %r12 -pushq %r13 -pushq %r14 -subq $104, %rsp -movq %rdx, %r13 -movq %rcx, %r14 -movq %rdi, %r12 -testq %r13, %r13 -je poly1305_finish_ext_avx2_29 -poly1305_finish_ext_avx2_2: -lea (%rsp), %rax -vpxor %ymm0, %ymm0, %ymm0 -subq %rax, %rsi -vmovdqu %ymm0, (%rsp) -vmovdqu %ymm0, 32(%rsp) -testq $32, %r13 -je poly1305_finish_ext_avx2_4 -poly1305_finish_ext_avx2_3: -vmovdqu (%rsp,%rsi), %ymm0 -lea 32(%rsp), %rax -vmovdqu %ymm0, (%rsp) -poly1305_finish_ext_avx2_4: -testq $16, %r13 -je poly1305_finish_ext_avx2_6 -poly1305_finish_ext_avx2_5: -vmovdqu (%rax,%rsi), %xmm0 -vmovdqu %xmm0, (%rax) -addq $16, %rax -poly1305_finish_ext_avx2_6: -testq $8, %r13 -je poly1305_finish_ext_avx2_8 -poly1305_finish_ext_avx2_7: -movq (%rax,%rsi), %rdx -movq %rdx, (%rax) -addq $8, %rax -poly1305_finish_ext_avx2_8: -testq $4, %r13 -je poly1305_finish_ext_avx2_10 -poly1305_finish_ext_avx2_9: -movl (%rax,%rsi), %edx -movl %edx, (%rax) -addq $4, %rax -poly1305_finish_ext_avx2_10: -testq $2, %r13 -je poly1305_finish_ext_avx2_12 -poly1305_finish_ext_avx2_11: -movzwl (%rax,%rsi), %edx -movw %dx, (%rax) -addq $2, %rax -poly1305_finish_ext_avx2_12: -testq $1, %r13 -je poly1305_finish_ext_avx2_14 -poly1305_finish_ext_avx2_13: -movb (%rax,%rsi), %dl -movb %dl, (%rax) -poly1305_finish_ext_avx2_14: -testq $15, %r13 -je 
poly1305_finish_ext_avx2_16 -poly1305_finish_ext_avx2_15: -movb $1, (%rsp,%r13) -poly1305_finish_ext_avx2_16: -movq 176(%r12), %rdx -andq $-8125, %rdx -cmpq $48, %r13 -jb poly1305_finish_ext_avx2_18 -poly1305_finish_ext_avx2_17: -orq $4, %rdx -jmp poly1305_finish_ext_avx2_21 -poly1305_finish_ext_avx2_18: -cmpq $32, %r13 -jb poly1305_finish_ext_avx2_20 -poly1305_finish_ext_avx2_19: -orq $8, %rdx -jmp poly1305_finish_ext_avx2_21 -poly1305_finish_ext_avx2_20: -movq %rdx, %rax -orq $32, %rdx -orq $16, %rax -cmpq $16, %r13 -cmovae %rax, %rdx -poly1305_finish_ext_avx2_21: -testq $1, %rdx -je poly1305_finish_ext_avx2_27 -poly1305_finish_ext_avx2_22: -cmpq $16, %r13 -ja poly1305_finish_ext_avx2_24 -poly1305_finish_ext_avx2_23: -orq $256, %rdx -movq %rdx, 176(%r12) -jmp poly1305_finish_ext_avx2_28 -poly1305_finish_ext_avx2_24: -cmpq $32, %r13 -ja poly1305_finish_ext_avx2_27 -poly1305_finish_ext_avx2_25: -orq $128, %rdx -movq %rdx, 176(%r12) -jmp poly1305_finish_ext_avx2_28 -poly1305_finish_ext_avx2_27: -movq %rdx, 176(%r12) -poly1305_finish_ext_avx2_28: -movq %r12, %rdi -lea (%rsp), %rsi -movl $64, %edx -vzeroupper -call poly1305_blocks_avx2_local -poly1305_finish_ext_avx2_29: -movq 176(%r12), %rdx -testq $1, %rdx -je poly1305_finish_ext_avx2_37 -poly1305_finish_ext_avx2_30: -andq $-8125, %rdx -testq %r13, %r13 -je poly1305_finish_ext_avx2_32 -poly1305_finish_ext_avx2_31: -cmpq $48, %r13 -jbe poly1305_finish_ext_avx2_33 -poly1305_finish_ext_avx2_32: -orq $512, %rdx -jmp poly1305_finish_ext_avx2_36 -poly1305_finish_ext_avx2_33: -cmpq $32, %r13 -jbe poly1305_finish_ext_avx2_35 -poly1305_finish_ext_avx2_34: -orq $1024, %rdx -jmp poly1305_finish_ext_avx2_36 -poly1305_finish_ext_avx2_35: -movq %rdx, %rax -orq $4096, %rdx -orq $2048, %rax -cmpq $16, %r13 -cmova %rax, %rdx -poly1305_finish_ext_avx2_36: -orq $96, %rdx -movq %r12, %rdi -vpxor %ymm0, %ymm0, %ymm0 -lea (%rsp), %rsi -movq %rdx, 176(%r12) -movl $64, %edx -vmovdqu %ymm0, (%rsp) -vmovdqu %ymm0, 32(%rsp) -vzeroupper -call 
poly1305_blocks_avx2_local -poly1305_finish_ext_avx2_37: -movq 8(%r12), %r8 -movq %r8, %rsi -movq 16(%r12), %rax -vpxor %ymm0, %ymm0, %ymm0 -shlq $44, %rsi -shrq $20, %r8 -shlq $24, %rax -orq (%r12), %rsi -orq %rax, %r8 -movq 160(%r12), %rdx -movq 168(%r12), %rcx -addq %rdx, %rsi -adcq %rcx, %r8 -vmovdqu %ymm0, (%r12) -vmovdqu %ymm0, 32(%r12) -vmovdqu %ymm0, 64(%r12) -vmovdqu %ymm0, 96(%r12) -vmovdqu %ymm0, 128(%r12) -vmovdqu %ymm0, 160(%r12) -movq %rsi, (%r14) -movq %r8, 8(%r14) -vzeroupper -addq $104, %rsp -popq %r14 -popq %r13 -popq %r12 -movq %rbp, %rsp -popq %rbp -ret -FN_END poly1305_finish_ext_avx2 - -GLOBAL_HIDDEN_FN poly1305_blocks_avx2 -poly1305_blocks_avx2_local: -pushq %rbp -movq %rsp, %rbp -andq $-64, %rsp -subq $384, %rsp -movl $16777216, %eax -movl $67108863, %ecx -movl $5, %r8d -vmovd %eax, %xmm1 -vmovd %ecx, %xmm10 -vmovd %r8d, %xmm0 -movq 176(%rdi), %rax -vpbroadcastq %xmm1, %ymm1 -vpbroadcastq %xmm10, %ymm10 -vpbroadcastq %xmm0, %ymm11 -testq $60, %rax -je poly1305_blocks_avx2_11 -poly1305_blocks_avx2_2: -vpsrldq $8, %ymm1, %ymm15 -testq $4, %rax -je poly1305_blocks_avx2_4 -poly1305_blocks_avx2_3: -vpermq $192, %ymm15, %ymm15 -poly1305_blocks_avx2_4: -testq $8, %rax -je poly1305_blocks_avx2_6 -poly1305_blocks_avx2_5: -vpermq $240, %ymm15, %ymm15 -poly1305_blocks_avx2_6: -testq $16, %rax -je poly1305_blocks_avx2_8 -poly1305_blocks_avx2_7: -vpermq $252, %ymm15, %ymm15 -poly1305_blocks_avx2_8: -testq $32, %rax -je poly1305_blocks_avx2_10 -poly1305_blocks_avx2_9: -vpxor %ymm15, %ymm15, %ymm15 -poly1305_blocks_avx2_10: -vmovdqa %ymm15, %ymm1 -poly1305_blocks_avx2_11: -movq %rax, %rcx -btsq $0, %rcx -jc poly1305_blocks_avx2_13 -poly1305_blocks_avx2_12: -vmovdqu (%rsi), %ymm3 -movq %rcx, %rax -vmovdqu 32(%rsi), %ymm5 -vpunpcklqdq %ymm5, %ymm3, %ymm4 -addq $64, %rsi -vpunpckhqdq %ymm5, %ymm3, %ymm7 -vpermq $216, %ymm4, %ymm6 -addq $-64, %rdx -vpermq $216, %ymm7, %ymm0 -vpsrlq $52, %ymm6, %ymm8 -vpsllq $12, %ymm0, %ymm9 -vpsrlq $26, %ymm6, %ymm2 -vpsrlq 
$40, %ymm0, %ymm0 -vpand %ymm6, %ymm10, %ymm4 -vpor %ymm9, %ymm8, %ymm7 -vpand %ymm2, %ymm10, %ymm3 -vpor %ymm1, %ymm0, %ymm9 -vpsrlq $26, %ymm7, %ymm2 -vpand %ymm7, %ymm10, %ymm5 -vpand %ymm2, %ymm10, %ymm7 -movq %rax, 176(%rdi) -jmp poly1305_blocks_avx2_14 -poly1305_blocks_avx2_13: -vpermq $216, (%rdi), %ymm15 -vpxor %ymm0, %ymm0, %ymm0 -vpermq $216, 32(%rdi), %ymm14 -vpermq $216, 64(%rdi), %ymm13 -vpunpckldq %ymm0, %ymm15, %ymm4 -vpunpckhdq %ymm0, %ymm15, %ymm3 -vpunpckldq %ymm0, %ymm14, %ymm5 -vpunpckhdq %ymm0, %ymm14, %ymm7 -vpunpckldq %ymm0, %ymm13, %ymm9 -poly1305_blocks_avx2_14: -cmpq $64, %rdx -jb poly1305_blocks_avx2_34 -poly1305_blocks_avx2_15: -vmovdqu 140(%rdi), %ymm0 -testq $8064, %rax -je poly1305_blocks_avx2_29 -poly1305_blocks_avx2_16: -vpermq $216, 80(%rdi), %ymm6 -vpermq $216, 100(%rdi), %ymm2 -vpermq $216, 120(%rdi), %ymm8 -vpermq $216, %ymm0, %ymm0 -testq $128, %rax -je poly1305_blocks_avx2_18 -poly1305_blocks_avx2_17: -vmovdqa %ymm0, %ymm15 -vmovdqa %ymm0, %ymm14 -vmovdqa %ymm0, %ymm13 -vmovdqa %ymm8, %ymm12 -jmp poly1305_blocks_avx2_28 -poly1305_blocks_avx2_18: -testq $256, %rax -je poly1305_blocks_avx2_20 -poly1305_blocks_avx2_19: -vmovdqa %ymm0, %ymm15 -vmovdqa %ymm0, %ymm14 -vmovdqa %ymm8, %ymm13 -vmovdqa %ymm2, %ymm12 -jmp poly1305_blocks_avx2_28 -poly1305_blocks_avx2_20: -testq $512, %rax -je poly1305_blocks_avx2_22 -poly1305_blocks_avx2_21: -vmovdqa %ymm0, %ymm15 -vmovdqa %ymm8, %ymm14 -vmovdqa %ymm2, %ymm13 -vmovdqa %ymm6, %ymm12 -jmp poly1305_blocks_avx2_28 -poly1305_blocks_avx2_22: -testq $1024, %rax -je poly1305_blocks_avx2_24 -poly1305_blocks_avx2_23: -vpxor %ymm12, %ymm12, %ymm12 -movl $1, %r8d -vmovdqa %ymm8, %ymm15 -vmovdqa %ymm2, %ymm14 -vmovdqa %ymm6, %ymm13 -vmovd %r8d, %xmm12 -jmp poly1305_blocks_avx2_28 -poly1305_blocks_avx2_24: -testq $2048, %rax -je poly1305_blocks_avx2_26 -poly1305_blocks_avx2_25: -vpxor %ymm12, %ymm12, %ymm12 -movl $1, %r8d -vmovd %r8d, %xmm13 -vmovdqa %ymm2, %ymm15 -vmovdqa %ymm6, %ymm14 -vmovdqa 
%ymm13, %ymm12 -jmp poly1305_blocks_avx2_28 -poly1305_blocks_avx2_26: -testq $4096, %rax -je poly1305_blocks_avx2_28 -poly1305_blocks_avx2_27: -movl $1, %r8d -vmovd %r8d, %xmm14 -vmovdqa %ymm6, %ymm15 -vmovdqa %ymm14, %ymm13 -vmovdqa %ymm14, %ymm12 -poly1305_blocks_avx2_28: -vpunpcklqdq %ymm14, %ymm15, %ymm6 -vpunpcklqdq %ymm12, %ymm13, %ymm8 -vpunpckhqdq %ymm14, %ymm15, %ymm14 -vpunpckhqdq %ymm12, %ymm13, %ymm12 -vperm2i128 $32, %ymm8, %ymm6, %ymm2 -vperm2i128 $49, %ymm8, %ymm6, %ymm6 -vpsrlq $32, %ymm6, %ymm0 -vpsrlq $32, %ymm2, %ymm8 -vmovdqu %ymm0, 352(%rsp) -vperm2i128 $32, %ymm12, %ymm14, %ymm13 -vmovdqu %ymm13, 320(%rsp) -jmp poly1305_blocks_avx2_30 -poly1305_blocks_avx2_29: -vpsrlq $32, %ymm0, %ymm12 -vpermq $0, %ymm0, %ymm2 -vpermq $85, %ymm0, %ymm6 -vpermq $85, %ymm12, %ymm13 -vpermq $170, %ymm0, %ymm0 -vpermq $0, %ymm12, %ymm8 -vmovdqu %ymm13, 352(%rsp) -vmovdqu %ymm0, 320(%rsp) -poly1305_blocks_avx2_30: -vmovdqu (%rsi), %ymm12 -movq %rdx, %r9 -vmovdqu 352(%rsp), %ymm15 -vmovdqu %ymm1, 160(%rsp) -vmovdqu %ymm10, 192(%rsp) -vmovdqu %ymm11, 128(%rsp) -vperm2i128 $32, 32(%rsi), %ymm12, %ymm13 -xorl %r8d, %r8d -vperm2i128 $49, 32(%rsi), %ymm12, %ymm12 -xorl %ecx, %ecx -vpmuludq %ymm11, %ymm8, %ymm0 -vpmuludq %ymm11, %ymm6, %ymm1 -vmovdqu %ymm0, 224(%rsp) -vmovdqu %ymm1, 256(%rsp) -vpunpckldq %ymm12, %ymm13, %ymm14 -vpunpckhdq %ymm12, %ymm13, %ymm12 -vmovdqu %ymm14, 32(%rsp) -vpmuludq %ymm0, %ymm9, %ymm0 -vpmuludq %ymm1, %ymm7, %ymm13 -vpaddq %ymm13, %ymm0, %ymm0 -vpmuludq %ymm11, %ymm15, %ymm10 -vpmuludq %ymm10, %ymm5, %ymm13 -vpaddq %ymm13, %ymm0, %ymm0 -vmovdqu %ymm10, 288(%rsp) -vpmuludq 320(%rsp), %ymm11, %ymm11 -vpmuludq %ymm11, %ymm3, %ymm13 -vpaddq %ymm13, %ymm0, %ymm0 -vmovdqu %ymm11, (%rsp) -vpmuludq %ymm2, %ymm4, %ymm13 -vpaddq %ymm13, %ymm0, %ymm0 -vpxor %ymm13, %ymm13, %ymm13 -vpunpckldq %ymm13, %ymm14, %ymm14 -vpaddq %ymm14, %ymm0, %ymm0 -vmovdqu %ymm0, 64(%rsp) -vpmuludq %ymm11, %ymm9, %ymm14 -vpmuludq %ymm2, %ymm7, %ymm0 -vpaddq %ymm0, %ymm14, 
%ymm14 -vpmuludq %ymm8, %ymm5, %ymm0 -vpaddq %ymm0, %ymm14, %ymm14 -vpmuludq %ymm6, %ymm3, %ymm0 -vpaddq %ymm0, %ymm14, %ymm14 -vpmuludq %ymm15, %ymm4, %ymm0 -vpaddq %ymm0, %ymm14, %ymm0 -vpunpckhdq %ymm13, %ymm12, %ymm14 -vpsllq $18, %ymm14, %ymm14 -vpaddq %ymm14, %ymm0, %ymm14 -vpmuludq %ymm1, %ymm9, %ymm1 -vpmuludq %ymm10, %ymm7, %ymm0 -vpaddq %ymm0, %ymm1, %ymm1 -vpmuludq %ymm11, %ymm5, %ymm0 -vpaddq %ymm0, %ymm1, %ymm1 -vpmuludq %ymm2, %ymm3, %ymm0 -vpaddq %ymm0, %ymm1, %ymm1 -vpmuludq %ymm8, %ymm4, %ymm0 -vpaddq %ymm0, %ymm1, %ymm1 -vmovdqu 32(%rsp), %ymm0 -vpunpckhdq %ymm13, %ymm0, %ymm0 -vpsllq $6, %ymm0, %ymm0 -vpaddq %ymm0, %ymm1, %ymm1 -vmovdqu 64(%rsp), %ymm0 -vpsrlq $26, %ymm0, %ymm0 -vpaddq %ymm0, %ymm1, %ymm1 -vmovdqu %ymm1, 96(%rsp) -vpmuludq %ymm2, %ymm9, %ymm1 -vpmuludq %ymm8, %ymm7, %ymm0 -vpaddq %ymm0, %ymm1, %ymm1 -vpmuludq %ymm10, %ymm9, %ymm10 -vpmuludq %ymm11, %ymm7, %ymm11 -vpaddq %ymm11, %ymm10, %ymm7 -vpmuludq %ymm6, %ymm5, %ymm0 -vpaddq %ymm0, %ymm1, %ymm1 -vpmuludq %ymm2, %ymm5, %ymm5 -vpaddq %ymm5, %ymm7, %ymm10 -vpmuludq %ymm15, %ymm3, %ymm15 -vpaddq %ymm15, %ymm1, %ymm1 -vpmuludq %ymm8, %ymm3, %ymm11 -vpaddq %ymm11, %ymm10, %ymm5 -vpunpckldq %ymm13, %ymm12, %ymm10 -vmovdqu 96(%rsp), %ymm12 -vpmuludq 320(%rsp), %ymm4, %ymm0 -vpaddq %ymm0, %ymm1, %ymm15 -vpsrlq $26, %ymm12, %ymm3 -vmovdqu 160(%rsp), %ymm1 -vpmuludq %ymm6, %ymm4, %ymm4 -vpaddq %ymm1, %ymm15, %ymm0 -vpsrlq $26, %ymm14, %ymm15 -vpaddq %ymm4, %ymm5, %ymm11 -vpsllq $12, %ymm10, %ymm4 -vmovdqu 192(%rsp), %ymm10 -vpaddq %ymm15, %ymm0, %ymm0 -vpaddq %ymm4, %ymm11, %ymm5 -vmovdqu 128(%rsp), %ymm11 -vpsrlq $26, %ymm0, %ymm9 -vpaddq %ymm3, %ymm5, %ymm7 -vpand 64(%rsp), %ymm10, %ymm13 -vpand %ymm10, %ymm12, %ymm12 -vpand %ymm10, %ymm7, %ymm5 -vpsrlq $26, %ymm7, %ymm7 -vpmuludq %ymm11, %ymm9, %ymm15 -vpand %ymm10, %ymm14, %ymm9 -vpaddq %ymm15, %ymm13, %ymm3 -vpand %ymm10, %ymm0, %ymm14 -vpaddq %ymm7, %ymm9, %ymm9 -vpand %ymm10, %ymm3, %ymm4 -vpsrlq $26, %ymm3, %ymm3 -vpsrlq $26, 
%ymm9, %ymm0 -vpand %ymm10, %ymm9, %ymm7 -vpaddq %ymm3, %ymm12, %ymm3 -vpaddq %ymm0, %ymm14, %ymm9 -sarq $5, %r9 -shrq $58, %r9 -addq %rdx, %r9 -sarq $6, %r9 -cmpq $2, %r9 -jl poly1305_blocks_avx2_34 -poly1305_blocks_avx2_31: -vmovdqu %ymm6, 32(%rsp) -lea -64(%rdx), %r9 -vmovdqu %ymm8, 64(%rsp) -vmovdqu %ymm11, 128(%rsp) -vmovdqu %ymm10, 192(%rsp) -vmovdqu %ymm1, 160(%rsp) -vmovdqu (%rsp), %ymm12 -sarq $5, %r9 -shrq $58, %r9 -lea -64(%rdx,%r9), %rdx -sarq $6, %rdx -poly1305_blocks_avx2_32: -vmovdqu 256(%rsp), %ymm15 -incq %r8 -vmovdqu 64(%rcx,%rsi), %ymm11 -vpmuludq 224(%rsp), %ymm9, %ymm8 -vpmuludq %ymm15, %ymm7, %ymm14 -vpaddq %ymm14, %ymm8, %ymm1 -vmovdqu 288(%rsp), %ymm8 -vperm2i128 $32, 96(%rcx,%rsi), %ymm11, %ymm10 -vperm2i128 $49, 96(%rcx,%rsi), %ymm11, %ymm6 -addq $64, %rcx -vpmuludq %ymm8, %ymm5, %ymm13 -vpunpckldq %ymm6, %ymm10, %ymm0 -vpunpckhdq %ymm6, %ymm10, %ymm11 -vpaddq %ymm13, %ymm1, %ymm10 -vpmuludq %ymm12, %ymm3, %ymm6 -vpaddq %ymm6, %ymm10, %ymm14 -vpxor %ymm10, %ymm10, %ymm10 -vpunpckldq %ymm10, %ymm0, %ymm6 -vpunpckhdq %ymm10, %ymm0, %ymm0 -vpmuludq %ymm2, %ymm4, %ymm1 -vpaddq %ymm1, %ymm14, %ymm13 -vpaddq %ymm6, %ymm13, %ymm1 -vmovdqu 64(%rsp), %ymm6 -vmovdqu %ymm1, (%rsp) -vpsrlq $26, %ymm1, %ymm1 -vpmuludq %ymm12, %ymm9, %ymm14 -vpmuludq %ymm2, %ymm7, %ymm13 -vpaddq %ymm13, %ymm14, %ymm14 -vpmuludq %ymm6, %ymm5, %ymm13 -vpaddq %ymm13, %ymm14, %ymm14 -vpmuludq 32(%rsp), %ymm3, %ymm13 -vpaddq %ymm13, %ymm14, %ymm14 -vpmuludq 352(%rsp), %ymm4, %ymm13 -vpaddq %ymm13, %ymm14, %ymm13 -vpunpckhdq %ymm10, %ymm11, %ymm14 -vpsllq $18, %ymm14, %ymm14 -vpaddq %ymm14, %ymm13, %ymm13 -vpmuludq %ymm15, %ymm9, %ymm15 -vpmuludq %ymm8, %ymm7, %ymm14 -vpaddq %ymm14, %ymm15, %ymm15 -vpmuludq %ymm12, %ymm5, %ymm14 -vpaddq %ymm14, %ymm15, %ymm15 -vpmuludq %ymm2, %ymm3, %ymm14 -vpaddq %ymm14, %ymm15, %ymm15 -vpmuludq %ymm6, %ymm4, %ymm14 -vpaddq %ymm14, %ymm15, %ymm14 -vpsllq $6, %ymm0, %ymm15 -vpaddq %ymm15, %ymm14, %ymm14 -vmovdqu 32(%rsp), %ymm15 -vpaddq 
%ymm1, %ymm14, %ymm1 -vpmuludq %ymm2, %ymm9, %ymm0 -vpmuludq %ymm6, %ymm7, %ymm14 -vpmuludq %ymm8, %ymm9, %ymm9 -vpmuludq %ymm12, %ymm7, %ymm7 -vpaddq %ymm7, %ymm9, %ymm7 -vpaddq %ymm14, %ymm0, %ymm0 -vpsrlq $26, %ymm1, %ymm9 -vpmuludq %ymm15, %ymm5, %ymm14 -vpmuludq %ymm2, %ymm5, %ymm5 -vpaddq %ymm5, %ymm7, %ymm5 -vpaddq %ymm14, %ymm0, %ymm0 -vpmuludq 352(%rsp), %ymm3, %ymm14 -vpmuludq %ymm6, %ymm3, %ymm3 -vpaddq %ymm3, %ymm5, %ymm5 -vpaddq %ymm14, %ymm0, %ymm0 -vpmuludq 320(%rsp), %ymm4, %ymm14 -vpmuludq %ymm15, %ymm4, %ymm4 -vpaddq %ymm4, %ymm5, %ymm5 -vpaddq %ymm14, %ymm0, %ymm0 -vpunpckldq %ymm10, %ymm11, %ymm4 -vpaddq 160(%rsp), %ymm0, %ymm14 -vpsrlq $26, %ymm13, %ymm0 -vpsllq $12, %ymm4, %ymm3 -vpaddq %ymm0, %ymm14, %ymm14 -vpaddq %ymm3, %ymm5, %ymm7 -vpsrlq $26, %ymm14, %ymm0 -vpaddq %ymm9, %ymm7, %ymm10 -vmovdqu 192(%rsp), %ymm9 -vpsrlq $26, %ymm10, %ymm11 -vpand (%rsp), %ymm9, %ymm6 -vpand %ymm9, %ymm13, %ymm13 -vpand %ymm9, %ymm1, %ymm1 -vpand %ymm9, %ymm14, %ymm14 -vpand %ymm9, %ymm10, %ymm5 -vpmuludq 128(%rsp), %ymm0, %ymm8 -vpaddq %ymm8, %ymm6, %ymm15 -vpaddq %ymm11, %ymm13, %ymm0 -vpsrlq $26, %ymm15, %ymm3 -vpand %ymm9, %ymm0, %ymm7 -vpsrlq $26, %ymm0, %ymm0 -vpand %ymm9, %ymm15, %ymm4 -vpaddq %ymm3, %ymm1, %ymm3 -vpaddq %ymm0, %ymm14, %ymm9 -cmpq %rdx, %r8 -jb poly1305_blocks_avx2_32 -poly1305_blocks_avx2_34: -testq $64, %rax -jne poly1305_blocks_avx2_36 -poly1305_blocks_avx2_35: -vpshufd $8, %ymm4, %ymm0 -vpshufd $8, %ymm3, %ymm3 -vpshufd $8, %ymm5, %ymm5 -vpshufd $8, %ymm7, %ymm7 -vpshufd $8, %ymm9, %ymm9 -vpermq $8, %ymm0, %ymm1 -vpermq $8, %ymm3, %ymm2 -vpermq $8, %ymm5, %ymm4 -vpermq $8, %ymm7, %ymm6 -vpermq $8, %ymm9, %ymm11 -vperm2i128 $32, %ymm2, %ymm1, %ymm8 -vperm2i128 $32, %ymm6, %ymm4, %ymm10 -vmovdqu %ymm8, (%rdi) -vmovdqu %ymm10, 32(%rdi) -vmovdqu %xmm11, 64(%rdi) -jmp poly1305_blocks_avx2_37 -poly1305_blocks_avx2_36: -vpermq $245, %ymm4, %ymm0 -vpaddq %ymm0, %ymm4, %ymm4 -vpermq $245, %ymm3, %ymm1 -vpaddq %ymm1, %ymm3, %ymm10 -vpermq 
$245, %ymm5, %ymm3 -vpermq $170, %ymm4, %ymm6 -vpaddq %ymm3, %ymm5, %ymm13 -vpaddq %ymm6, %ymm4, %ymm8 -vpermq $170, %ymm10, %ymm11 -vpermq $245, %ymm7, %ymm5 -vpaddq %ymm11, %ymm10, %ymm12 -vpaddq %ymm5, %ymm7, %ymm7 -vpermq $170, %ymm13, %ymm14 -vpermq $245, %ymm9, %ymm2 -vpaddq %ymm14, %ymm13, %ymm15 -vpaddq %ymm2, %ymm9, %ymm9 -vpermq $170, %ymm7, %ymm0 -vpaddq %ymm0, %ymm7, %ymm1 -vpermq $170, %ymm9, %ymm2 -vpaddq %ymm2, %ymm9, %ymm3 -vmovd %xmm8, %r9d -movl %r9d, %r8d -shrl $26, %r8d -andq $67108863, %r9 -vmovd %xmm12, %esi -addl %r8d, %esi -movl %esi, %r11d -shrl $26, %esi -andq $67108863, %r11 -vmovd %xmm15, %ecx -addl %esi, %ecx -movl %ecx, %eax -shrl $26, %eax -andq $67108863, %rcx -shlq $8, %rcx -vmovd %xmm1, %r8d -addl %eax, %r8d -movl %r8d, %r10d -shrl $26, %r8d -andq $67108863, %r10 -movq %r10, %rax -shrq $10, %rax -shlq $34, %r10 -vmovd %xmm3, %edx -addl %r8d, %edx -shlq $16, %rdx -orq %rdx, %rax -movq %rax, %r8 -shrq $42, %r8 -lea (%r8,%r8,4), %rdx -movq %r11, %r8 -shlq $26, %r8 -orq %r8, %r9 -movq $0xfffffffffff, %r8 -shrq $18, %r11 -andq %r8, %r9 -addq %r9, %rdx -orq %rcx, %r11 -movq %rdx, %rsi -orq %r10, %r11 -shrq $44, %rsi -andq %r8, %r11 -addq %r11, %rsi -movq $0x3ffffffffff, %r9 -movq %rsi, %r10 -andq %r9, %rax -shrq $44, %r10 -andq %r8, %rdx -addq %r10, %rax -movq %r8, %rcx -andq %rax, %r9 -andq %r8, %rsi -shrq $42, %rax -movq $0xfffffc0000000000, %r10 -lea (%rax,%rax,4), %r11 -addq %r11, %rdx -andq %rdx, %rcx -shrq $44, %rdx -addq %rdx, %rsi -lea 5(%rcx), %rdx -movq %rdx, %r11 -andq %r8, %rdx -shrq $44, %r11 -addq %rsi, %r11 -movq %r11, %rax -andq %r11, %r8 -shrq $44, %rax -addq %r9, %rax -addq %r10, %rax -movq %rax, %r10 -shrq $63, %r10 -decq %r10 -andn %rcx, %r10, %rcx -andq %r10, %rdx -orq %rdx, %rcx -andq %r10, %r8 -andn %rsi, %r10, %rdx -andq %r10, %rax -andn %r9, %r10, %rsi -orq %r8, %rdx -orq %rax, %rsi -movq %rcx, (%rdi) -movq %rdx, 8(%rdi) -movq %rsi, 16(%rdi) -poly1305_blocks_avx2_37: -vzeroupper -movq %rbp, %rsp -popq %rbp -ret 
-FN_END poly1305_blocks_avx2 - -GLOBAL_HIDDEN_FN poly1305_init_ext_avx2 -poly1305_init_ext_avx2_local: -pushq %r12 -pushq %r13 -pushq %r14 -pushq %r15 -pushq %rbx -movq %rdi, %r10 -vpxor %ymm0, %ymm0, %ymm0 -movq %rdx, %r12 -vpxor %xmm1, %xmm1, %xmm1 -vmovdqu %xmm1, 64(%r10) -vmovdqu %ymm0, (%r10) -vmovdqu %ymm0, 32(%r10) -movq $-1, %r8 -testq %r12, %r12 -movq 8(%rsi), %rdi -movq $0xffc0fffffff, %r9 -movq %rdi, %rcx -cmove %r8, %r12 -movq (%rsi), %r8 -andq %r8, %r9 -shrq $44, %r8 -movq $0xfffffc0ffff, %r11 -shlq $20, %rcx -shrq $24, %rdi -orq %rcx, %r8 -movq $0xffffffc0f, %rcx -andq %r11, %r8 -andq %rcx, %rdi -movq 16(%rsi), %rcx -movq %rcx, 160(%r10) -movq %r9, %rcx -movq 24(%rsi), %rdx -movq %rdx, 168(%r10) -movl %r9d, %edx -andl $67108863, %edx -movl %edx, 80(%r10) -movq %r8, %rdx -shrq $26, %rcx -shlq $18, %rdx -orq %rdx, %rcx -movq %r8, %rdx -shrq $8, %rdx -andl $67108863, %ecx -andl $67108863, %edx -movl %ecx, 84(%r10) -movq %r8, %rcx -movl %edx, 88(%r10) -movq %rdi, %rdx -shrq $34, %rcx -shlq $10, %rdx -orq %rdx, %rcx -movq %rdi, %rdx -shrq $16, %rdx -andl $67108863, %ecx -movl %ecx, 92(%r10) -movl %edx, 96(%r10) -cmpq $16, %r12 -jbe poly1305_init_ext_avx2_7 -poly1305_init_ext_avx2_2: -movq %r9, %rax -lea (%rdi,%rdi,4), %r14 -mulq %r9 -shlq $2, %r14 -movq %rax, %r11 -movq %rdx, %r15 -lea (%r8,%r8), %rax -mulq %r14 -addq %rax, %r11 -lea (%r9,%r9), %rax -movq %r11, %rsi -adcq %rdx, %r15 -mulq %r8 -movq %rax, %rbx -movq %r14, %rax -movq %rdx, %rcx -lea (%rdi,%rdi), %r14 -mulq %rdi -addq %rax, %rbx -movq %r8, %rax -adcq %rdx, %rcx -mulq %r8 -shlq $20, %r15 -movq %rax, %r13 -shrq $44, %rsi -movq %r9, %rax -orq %rsi, %r15 -movq %rdx, %rsi -mulq %r14 -addq %r15, %rbx -movq %rbx, %r15 -adcq $0, %rcx -addq %rax, %r13 -adcq %rdx, %rsi -shlq $20, %rcx -shrq $44, %r15 -orq %r15, %rcx -addq %rcx, %r13 -movq $0xfffffffffff, %rcx -movq %r13, %rdx -adcq $0, %rsi -andq %rcx, %r11 -shlq $22, %rsi -andq %rcx, %rbx -shrq $42, %rdx -orq %rdx, %rsi -lea (%rsi,%rsi,4), %rsi -addq 
%rsi, %r11 -movq %rcx, %rsi -andq %r11, %rsi -shrq $44, %r11 -addq %r11, %rbx -movq $0x3ffffffffff, %r11 -andq %rbx, %rcx -andq %r11, %r13 -shrq $44, %rbx -movq %rsi, %r11 -movq %rcx, %rdx -addq %r13, %rbx -shrq $26, %r11 -movq %rbx, %r15 -shlq $18, %rdx -movq %rcx, %r14 -orq %rdx, %r11 -movq %rcx, %rdx -shrq $34, %rdx -movl %esi, %r13d -shlq $10, %r15 -andl $67108863, %r13d -orq %r15, %rdx -andl $67108863, %r11d -shrq $8, %r14 -andl $67108863, %edx -movl %edx, 112(%r10) -movq %rbx, %rdx -shrq $16, %rdx -andl $67108863, %r14d -movl %r13d, 100(%r10) -movl %r11d, 104(%r10) -movl %r14d, 108(%r10) -movl %edx, 116(%r10) -cmpq $48, %r12 -jbe poly1305_init_ext_avx2_4 -poly1305_init_ext_avx2_3: -movq %rsi, %rax -lea (%rbx,%rbx,4), %r15 -mulq %rsi -shlq $2, %r15 -movq %rax, %r13 -movq %rdx, %r12 -lea (%rcx,%rcx), %rax -mulq %r15 -addq %rax, %r13 -lea (%rsi,%rsi), %rax -movq %r15, -16(%rsp) -adcq %rdx, %r12 -mulq %rcx -movq %rax, %r14 -movq %rbx, %rax -movq %rdx, %r11 -mulq %r15 -addq %rax, %r14 -movq %rcx, %rax -movq %r13, %r15 -adcq %rdx, %r11 -mulq %rcx -shlq $20, %r12 -shrq $44, %r15 -orq %r15, %r12 -movq %rax, %r15 -addq %r12, %r14 -movq %rdx, %r12 -movq %rsi, %rax -lea (%rbx,%rbx), %rdx -adcq $0, %r11 -mulq %rdx -addq %rax, %r15 -adcq %rdx, %r12 -movq %r14, %rdx -shlq $20, %r11 -shrq $44, %rdx -orq %rdx, %r11 -addq %r11, %r15 -movq $0xfffffffffff, %r11 -movq %r15, %rdx -adcq $0, %r12 -andq %r11, %r13 -shlq $22, %r12 -andq %r11, %r14 -shrq $42, %rdx -orq %rdx, %r12 -lea (%r12,%r12,4), %r12 -addq %r12, %r13 -movq %r11, %r12 -andq %r13, %r12 -shrq $44, %r13 -addq %r13, %r14 -movq $0x3ffffffffff, %r13 -andq %r14, %r11 -andq %r13, %r15 -shrq $44, %r14 -movq %r11, %rdx -shlq $18, %rdx -addq %r14, %r15 -movl %r12d, %r14d -movq %r11, %r13 -shrq $26, %r12 -andl $67108863, %r14d -orq %rdx, %r12 -movq %r15, %rdx -shrq $34, %r11 -shlq $10, %rdx -andl $67108863, %r12d -orq %rdx, %r11 -shrq $8, %r13 -andl $67108863, %r11d -movl %r11d, 152(%r10) -andl $67108863, %r13d -shrq $16, %r15 
-movl %r14d, 140(%r10) -movl %r12d, 144(%r10) -movl %r13d, 148(%r10) -movl %r15d, 156(%r10) -movq -16(%rsp), %r11 -jmp poly1305_init_ext_avx2_6 -poly1305_init_ext_avx2_4: -cmpq $32, %r12 -jbe poly1305_init_ext_avx2_7 -poly1305_init_ext_avx2_5: -lea (%rbx,%rbx,4), %r11 -shlq $2, %r11 -poly1305_init_ext_avx2_6: -movq %r9, %rax -lea (%rcx,%rcx,4), %r13 -mulq %rsi -shlq $2, %r13 -movq %rax, %r14 -movq %rdi, %rax -movq %rdx, %r12 -mulq %r13 -addq %rax, %r14 -movq %r8, %rax -adcq %rdx, %r12 -mulq %r11 -addq %rax, %r14 -movq %r8, %rax -adcq %rdx, %r12 -mulq %rsi -movq %rax, %r15 -movq %r9, %rax -movq %rdx, %r13 -mulq %rcx -addq %rax, %r15 -movq %r11, %rax -movq %r14, %r11 -adcq %rdx, %r13 -mulq %rdi -addq %rax, %r15 -movq %rdi, %rax -adcq %rdx, %r13 -mulq %rsi -shlq $20, %r12 -movq %rax, %rsi -shrq $44, %r11 -movq %r8, %rax -orq %r11, %r12 -movq %rdx, %rdi -mulq %rcx -addq %r12, %r15 -movq %r15, %rcx -adcq $0, %r13 -addq %rax, %rsi -movq %r9, %rax -movq $0xfffffffffff, %r9 -adcq %rdx, %rdi -andq %r9, %r14 -mulq %rbx -addq %rax, %rsi -adcq %rdx, %rdi -movq %r9, %rdx -shlq $20, %r13 -andq %r9, %r15 -shrq $44, %rcx -orq %rcx, %r13 -addq %r13, %rsi -movq %rsi, %rbx -adcq $0, %rdi -shlq $22, %rdi -shrq $42, %rbx -orq %rbx, %rdi -lea (%rdi,%rdi,4), %r8 -addq %r8, %r14 -andq %r14, %rdx -shrq $44, %r14 -addq %r14, %r15 -movq $0x3ffffffffff, %r14 -andq %r15, %r9 -andq %r14, %rsi -shrq $44, %r15 -movq %r9, %rax -addq %r15, %rsi -movl %edx, %r15d -movq %rsi, %rbx -movq %r9, %rcx -shrq $26, %rdx -andl $67108863, %r15d -shlq $18, %rax -shrq $34, %r9 -orq %rax, %rdx -shlq $10, %rbx -shrq $8, %rcx -orq %rbx, %r9 -shrq $16, %rsi -andl $67108863, %edx -andl $67108863, %ecx -andl $67108863, %r9d -movl %r15d, 120(%r10) -movl %edx, 124(%r10) -movl %ecx, 128(%r10) -movl %r9d, 132(%r10) -movl %esi, 136(%r10) -poly1305_init_ext_avx2_7: -movq $0, 176(%r10) -vzeroupper -popq %rbx -popq %r15 -popq %r14 -popq %r13 -popq %r12 -ret -FN_END poly1305_init_ext_avx2 - diff --git 
a/src/libcryptobox/poly1305/constants.S b/src/libcryptobox/poly1305/constants.S deleted file mode 100644 index a4797a2aa..000000000 --- a/src/libcryptobox/poly1305/constants.S +++ /dev/null @@ -1,21 +0,0 @@ -SECTION_RODATA - -.p2align 4 -poly1305_constants_x86: -/* 0 */ poly1305_x86_scale: .long 0x0,0x37f40000 -/* 8 */ poly1305_x86_two32: .long 0x0,0x41f00000 -/* 16 */ poly1305_x86_two64: .long 0x0,0x43f00000 -/* 24 */ poly1305_x86_two96: .long 0x0,0x45f00000 -/* 32 */ poly1305_x86_alpha32: .long 0x0,0x45e80000 -/* 40 */ poly1305_x86_alpha64: .long 0x0,0x47e80000 -/* 48 */ poly1305_x86_alpha96: .long 0x0,0x49e80000 -/* 56 */ poly1305_x86_alpha130: .long 0x0,0x4c080000 -/* 64 */ poly1305_x86_doffset0: .long 0x0,0x43300000 -/* 72 */ poly1305_x86_doffset1: .long 0x0,0x45300000 -/* 80 */ poly1305_x86_doffset2: .long 0x0,0x47300000 -/* 88 */ poly1305_x86_doffset3: .long 0x0,0x49300000 -/* 96 */ poly1305_x86_doffset3minustwo128: .long 0x0,0x492ffffe -/* 104 */ poly1305_x86_hoffset0: .long 0xfffffffb,0x43300001 -/* 112 */ poly1305_x86_hoffset1: .long 0xfffffffe,0x45300001 -/* 120 */ poly1305_x86_hoffset2: .long 0xfffffffe,0x47300001 -/* 128 */ poly1305_x86_hoffset3: .long 0xfffffffe,0x49300003 diff --git a/src/libcryptobox/poly1305/poly1305.c b/src/libcryptobox/poly1305/poly1305.c deleted file mode 100644 index 4adea30af..000000000 --- a/src/libcryptobox/poly1305/poly1305.c +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2015, Vsevolod Stakhov - * Copyright (c) 2015, Andrew Moon - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "cryptobox.h" -#include "poly1305.h" -#include "platform_config.h" - -extern unsigned long cpu_config; - -typedef struct poly1305_state_internal_t -{ - unsigned char opaque[192]; /* largest state required (AVX2) */ - size_t leftover, block_size; - unsigned char buffer[64]; /* largest blocksize (AVX2) */ -} poly1305_state_internal; - -typedef struct poly1305_impl_t -{ - unsigned long cpu_flags; - const char *desc; - - size_t (*block_size)(void); - void (*init_ext)(void *state, const poly1305_key *key, size_t bytes_hint); - void (*blocks)(void *state, const unsigned char *in, size_t inlen); - void (*finish_ext)(void *state, const unsigned char *in, size_t remaining, - unsigned char *mac); - void (*auth)(unsigned char *mac, const unsigned char *in, size_t inlen, - const poly1305_key *key); -} poly1305_impl_t; - -#define POLY1305_DECLARE(ext) \ - size_t poly1305_block_size_##ext(void); \ - void poly1305_init_ext_##ext(void *state, const poly1305_key *key, size_t bytes_hint); \ - void 
poly1305_blocks_##ext(void *state, const unsigned char *in, size_t inlen); \ - void poly1305_finish_ext_##ext(void *state, const unsigned char *in, size_t remaining, unsigned char *mac); \ - void poly1305_auth_##ext(unsigned char *mac, const unsigned char *m, size_t inlen, const poly1305_key *key); - -#define POLY1305_IMPL(cpuflags, desc, ext) \ - {(cpuflags), desc, poly1305_block_size_##ext, poly1305_init_ext_##ext, poly1305_blocks_##ext, poly1305_finish_ext_##ext, poly1305_auth_##ext} - -#if defined(HAVE_AVX2) -POLY1305_DECLARE(avx2) -#define POLY1305_AVX2 POLY1305_IMPL(CPUID_AVX2, "avx2", avx2) -#endif -#if defined(HAVE_AVX) -POLY1305_DECLARE(avx) -#define POLY1305_AVX POLY1305_IMPL(CPUID_AVX, "avx", avx) -#endif -#if defined(HAVE_SSE2) -POLY1305_DECLARE(sse2) -#define POLY1305_SSE2 POLY1305_IMPL(CPUID_SSE2, "sse2", sse2) -#endif - -POLY1305_DECLARE(ref) -#define POLY1305_GENERIC POLY1305_IMPL(0, "generic", ref) - -/* generic is listed first because poly1305_opt defaults to poly1305_list[0]; the optimized implementations follow, from most optimized to least */ -static const poly1305_impl_t poly1305_list[] = -{ -POLY1305_GENERIC, - -#if defined(POLY1305_AVX2) - POLY1305_AVX2, -#endif -#if defined(POLY1305_AVX) - POLY1305_AVX, -#endif -#if defined(POLY1305_SSE2) - POLY1305_SSE2, -#endif -}; - -static const poly1305_impl_t *poly1305_opt = &poly1305_list[0]; - -/* is the pointer aligned on a word boundary? 
*/ -static int poly1305_is_aligned(const void *p) -{ - return ((size_t) p & (sizeof(size_t) - 1)) == 0; -} - -const char* -poly1305_load(void) -{ - guint i; - - if (cpu_config != 0) { - for (i = 0; i < G_N_ELEMENTS(poly1305_list); i++) { - if (poly1305_list[i].cpu_flags & cpu_config) { - poly1305_opt = &poly1305_list[i]; - break; - } - } - } - - return poly1305_opt->desc; -} - -/* processes inlen bytes (full blocks only), handling input alignment */ -static void poly1305_consume(poly1305_state_internal *state, - const unsigned char *in, size_t inlen) -{ - int in_aligned; - - /* it's ok to call with 0 bytes */ - if (!inlen) - return; - - /* if everything is aligned, handle directly */ - in_aligned = poly1305_is_aligned (in); - if (in_aligned) { - poly1305_opt->blocks (state->opaque, in, inlen); - return; - } - - /* copy the unaligned data to an aligned buffer and process in chunks */ - while (inlen) { - unsigned char buffer[1024]; - const size_t bytes = (inlen > sizeof(buffer)) ? sizeof(buffer) : inlen; - memcpy (buffer, in, bytes); - poly1305_opt->blocks (state->opaque, buffer, bytes); - in += bytes; - inlen -= bytes; - } -} - -void poly1305_init(poly1305_state *S, const poly1305_key *key) -{ - poly1305_state_internal *state = (poly1305_state_internal *) S; - poly1305_opt->init_ext (state->opaque, key, 0); - state->leftover = 0; - state->block_size = poly1305_opt->block_size (); -} - -void poly1305_init_ext(poly1305_state *S, const poly1305_key *key, - size_t bytes_hint) -{ - poly1305_state_internal *state = (poly1305_state_internal *) S; - poly1305_opt->init_ext (state->opaque, key, bytes_hint); - state->leftover = 0; - state->block_size = poly1305_opt->block_size (); -} - -void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen) -{ - poly1305_state_internal *state = (poly1305_state_internal *) S; - - /* handle leftover */ - if (state->leftover) { - size_t want = (state->block_size - state->leftover); - if (want > inlen) - want = inlen; - 
memcpy (state->buffer + state->leftover, in, want); - inlen -= want; - in += want; - state->leftover += want; - if (state->leftover < state->block_size) - return; - poly1305_opt->blocks (state->opaque, state->buffer, state->block_size); - state->leftover = 0; - } - - /* process full blocks */ - if (inlen >= state->block_size) { - size_t want = (inlen & ~(state->block_size - 1)); - poly1305_consume (state, in, want); - in += want; - inlen -= want; - } - - /* store leftover */ - if (inlen) { - memcpy (state->buffer + state->leftover, in, inlen); - state->leftover += inlen; - } -} - -void poly1305_finish(poly1305_state *S, unsigned char *mac) -{ - poly1305_state_internal *state = (poly1305_state_internal *) S; - poly1305_opt->finish_ext (state->opaque, state->buffer, state->leftover, - mac); -} - -void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, - const poly1305_key *key) -{ - poly1305_opt->auth (mac, in, inlen, key); -} - -int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]) -{ - size_t i; - unsigned int dif = 0; - - for (i = 0; i < 16; i++) { - dif |= (mac1[i] ^ mac2[i]); - } - - dif = (dif - 1) >> ((sizeof(unsigned int) * 8) - 1); - return (dif & 1); -} diff --git a/src/libcryptobox/poly1305/poly1305.h b/src/libcryptobox/poly1305/poly1305.h deleted file mode 100644 index 902a9c288..000000000 --- a/src/libcryptobox/poly1305/poly1305.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef POLY1305_H -#define POLY1305_H - -#include <stddef.h> - -#if defined(__cplusplus) -extern "C" -{ -#endif - -typedef struct poly1305_state -{ - unsigned char opaque[320]; -} poly1305_state; - -typedef struct poly1305_key -{ - unsigned char b[32]; -} poly1305_key; - -void poly1305_init(poly1305_state *S, const poly1305_key *key); -void poly1305_init_ext(poly1305_state *S, const poly1305_key *key, - size_t bytes_hint); -void poly1305_update(poly1305_state *S, const unsigned char *in, size_t inlen); -void poly1305_finish(poly1305_state *S, 
unsigned char *mac); - -void poly1305_auth(unsigned char *mac, const unsigned char *in, size_t inlen, - const poly1305_key *key); -int poly1305_verify(const unsigned char mac1[16], const unsigned char mac2[16]); - -const char* poly1305_load(void); - -#if defined(__cplusplus) -} -#endif - -#endif /* POLY1305_H */ - diff --git a/src/libcryptobox/poly1305/poly1305_internal.h b/src/libcryptobox/poly1305/poly1305_internal.h deleted file mode 100644 index 21b7aa7d2..000000000 --- a/src/libcryptobox/poly1305/poly1305_internal.h +++ /dev/null @@ -1,19 +0,0 @@ -#if defined(_MSC_VER) - #include <intrin.h> - - typedef struct uint128_t { - unsigned long long lo; - unsigned long long hi; - } uint128_t; - - #define POLY1305_NOINLINE __declspec(noinline) -#elif defined(__GNUC__) - #pragma GCC system_header - #if defined(__SIZEOF_INT128__) - typedef unsigned __int128 uint128_t; - #else - typedef unsigned uint128_t __attribute__((mode(TI))); - #endif - - #define POLY1305_NOINLINE __attribute__((noinline)) -#endif diff --git a/src/libcryptobox/poly1305/ref-32.c b/src/libcryptobox/poly1305/ref-32.c deleted file mode 100644 index 9f0ea998b..000000000 --- a/src/libcryptobox/poly1305/ref-32.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication and 64 bit addition - - assumes the existence of uint32_t and uint64_t -*/ - -#include "config.h" -#include "poly1305.h" - -enum { - POLY1305_BLOCK_SIZE = 16 -}; - -typedef struct poly1305_state_ref_t { - uint32_t r[5]; - uint32_t h[5]; - uint32_t pad[4]; - unsigned char final; -} poly1305_state_ref_t; - -/* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */ -static uint32_t -U8TO32(const unsigned char *p) { - return - (((uint32_t)(p[0] & 0xff) ) | - ((uint32_t)(p[1] & 0xff) << 8) | - ((uint32_t)(p[2] & 0xff) << 16) | - ((uint32_t)(p[3] & 0xff) << 24)); -} - -/* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */ -static 
void -U32TO8(unsigned char *p, uint32_t v) { - p[0] = (unsigned char)((v ) & 0xff); - p[1] = (unsigned char)((v >> 8) & 0xff); - p[2] = (unsigned char)((v >> 16) & 0xff); - p[3] = (unsigned char)((v >> 24) & 0xff); -} - -size_t -poly1305_block_size_ref(void) { - return POLY1305_BLOCK_SIZE; -} - -void -poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) { - poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; - - /* bytes_hint not used */ - (void)bytes_hint; - - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - st->r[0] = (U8TO32(&key->b[ 0]) ) & 0x3ffffff; - st->r[1] = (U8TO32(&key->b[ 3]) >> 2) & 0x3ffff03; - st->r[2] = (U8TO32(&key->b[ 6]) >> 4) & 0x3ffc0ff; - st->r[3] = (U8TO32(&key->b[ 9]) >> 6) & 0x3f03fff; - st->r[4] = (U8TO32(&key->b[12]) >> 8) & 0x00fffff; - - /* h = 0 */ - st->h[0] = 0; - st->h[1] = 0; - st->h[2] = 0; - st->h[3] = 0; - st->h[4] = 0; - - /* save pad for later */ - st->pad[0] = U8TO32(&key->b[16]); - st->pad[1] = U8TO32(&key->b[20]); - st->pad[2] = U8TO32(&key->b[24]); - st->pad[3] = U8TO32(&key->b[28]); - - st->final = 0; -} - -void -poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) { - poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; - const uint32_t hibit = (st->final) ? 
0 : (1 << 24); /* 1 << 128 */ - uint32_t r0,r1,r2,r3,r4; - uint32_t s1,s2,s3,s4; - uint32_t h0,h1,h2,h3,h4; - uint64_t d0,d1,d2,d3,d4; - uint32_t c; - - r0 = st->r[0]; - r1 = st->r[1]; - r2 = st->r[2]; - r3 = st->r[3]; - r4 = st->r[4]; - - s1 = r1 * 5; - s2 = r2 * 5; - s3 = r3 * 5; - s4 = r4 * 5; - - h0 = st->h[0]; - h1 = st->h[1]; - h2 = st->h[2]; - h3 = st->h[3]; - h4 = st->h[4]; - - while (inlen >= POLY1305_BLOCK_SIZE) { - /* h += m[i] */ - h0 += (U8TO32(in+ 0) ) & 0x3ffffff; - h1 += (U8TO32(in+ 3) >> 2) & 0x3ffffff; - h2 += (U8TO32(in+ 6) >> 4) & 0x3ffffff; - h3 += (U8TO32(in+ 9) >> 6) & 0x3ffffff; - h4 += (U8TO32(in+12) >> 8) | hibit; - - /* h *= r */ - d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + ((uint64_t)h4 * s1); - d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + ((uint64_t)h4 * s2); - d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + ((uint64_t)h4 * s3); - d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + ((uint64_t)h4 * s4); - d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + ((uint64_t)h4 * r0); - - /* (partial) h %= p */ - c = (uint32_t)(d0 >> 26); h0 = (uint32_t)d0 & 0x3ffffff; - d1 += c; c = (uint32_t)(d1 >> 26); h1 = (uint32_t)d1 & 0x3ffffff; - d2 += c; c = (uint32_t)(d2 >> 26); h2 = (uint32_t)d2 & 0x3ffffff; - d3 += c; c = (uint32_t)(d3 >> 26); h3 = (uint32_t)d3 & 0x3ffffff; - d4 += c; c = (uint32_t)(d4 >> 26); h4 = (uint32_t)d4 & 0x3ffffff; - h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff; - h1 += c; - - in += POLY1305_BLOCK_SIZE; - inlen -= POLY1305_BLOCK_SIZE; - } - - st->h[0] = h0; - st->h[1] = h1; - st->h[2] = h2; - st->h[3] = h3; - st->h[4] = h4; -} - -void -poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) { - poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; - 
uint32_t h0,h1,h2,h3,h4,c; - uint32_t g0,g1,g2,g3,g4; - uint64_t f; - uint32_t mask; - - /* process the remaining block */ - if (remaining) { - unsigned char final[POLY1305_BLOCK_SIZE] = {0}; - size_t i; - for (i = 0; i < remaining; i++) - final[i] = in[i]; - final[remaining] = 1; - st->final = 1; - poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE); - } - - /* fully carry h */ - h0 = st->h[0]; - h1 = st->h[1]; - h2 = st->h[2]; - h3 = st->h[3]; - h4 = st->h[4]; - - c = h1 >> 26; h1 = h1 & 0x3ffffff; - h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff; - h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff; - h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff; - h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff; - h1 += c; - - /* compute h + -p */ - g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff; - g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff; - g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff; - g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff; - g4 = h4 + c - (1 << 26); - - /* select h if h < p, or h + -p if h >= p */ - mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1; - g0 &= mask; - g1 &= mask; - g2 &= mask; - g3 &= mask; - g4 &= mask; - mask = ~mask; - h0 = (h0 & mask) | g0; - h1 = (h1 & mask) | g1; - h2 = (h2 & mask) | g2; - h3 = (h3 & mask) | g3; - h4 = (h4 & mask) | g4; - - /* h = h % (2^128) */ - h0 = ((h0 ) | (h1 << 26)) & 0xffffffff; - h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; - h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; - h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; - - /* mac = (h + pad) % (2^128) */ - f = (uint64_t)h0 + st->pad[0] ; h0 = (uint32_t)f; - f = (uint64_t)h1 + st->pad[1] + (f >> 32); h1 = (uint32_t)f; - f = (uint64_t)h2 + st->pad[2] + (f >> 32); h2 = (uint32_t)f; - f = (uint64_t)h3 + st->pad[3] + (f >> 32); h3 = (uint32_t)f; - - U32TO8(mac + 0, h0); - U32TO8(mac + 4, h1); - U32TO8(mac + 8, h2); - U32TO8(mac + 12, h3); - - /* zero out the state */ - st->h[0] = 0; - st->h[1] = 0; - st->h[2] = 0; - st->h[3] = 0; - st->h[4] = 0; - st->r[0] = 0; - st->r[1] = 0; - st->r[2] = 0; - st->r[3] 
= 0; - st->r[4] = 0; - st->pad[0] = 0; - st->pad[1] = 0; - st->pad[2] = 0; - st->pad[3] = 0; -} - -void -poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const poly1305_key *key) { - poly1305_state_ref_t st; - size_t blocks; - poly1305_init_ext_ref(&st, key, inlen); - blocks = (inlen & ~(POLY1305_BLOCK_SIZE - 1)); - if (blocks) { - poly1305_blocks_ref(&st, in, blocks); - in += blocks; - inlen -= blocks; - } - poly1305_finish_ext_ref(&st, in, inlen, mac); -} - diff --git a/src/libcryptobox/poly1305/ref-64.c b/src/libcryptobox/poly1305/ref-64.c deleted file mode 100644 index cceb1476d..000000000 --- a/src/libcryptobox/poly1305/ref-64.c +++ /dev/null @@ -1,213 +0,0 @@ -/* - poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication and 128 bit addition - - assumes the existence of uint64_t and uint128_t -*/ - -#include "config.h" -#include "poly1305.h" -#include "poly1305_internal.h" - -#define POLY1305_BLOCK_SIZE 16 - -typedef struct poly1305_state_ref_t { - uint64_t r[3]; - uint64_t h[3]; - uint64_t pad[2]; - unsigned char final; -} poly1305_state_ref_t; - -/* interpret eight 8 bit unsigned integers as a 64 bit unsigned integer in little endian */ -static uint64_t -U8TO64(const unsigned char *p) { - return - ((uint64_t)p[0] ) | - ((uint64_t)p[1] << 8) | - ((uint64_t)p[2] << 16) | - ((uint64_t)p[3] << 24) | - ((uint64_t)p[4] << 32) | - ((uint64_t)p[5] << 40) | - ((uint64_t)p[6] << 48) | - ((uint64_t)p[7] << 56); -} - -/* store a 64 bit unsigned integer as eight 8 bit unsigned integers in little endian */ -static void -U64TO8(unsigned char *p, uint64_t v) { - p[0] = (unsigned char)(v ) & 0xff; - p[1] = (unsigned char)(v >> 8) & 0xff; - p[2] = (unsigned char)(v >> 16) & 0xff; - p[3] = (unsigned char)(v >> 24) & 0xff; - p[4] = (unsigned char)(v >> 32) & 0xff; - p[5] = (unsigned char)(v >> 40) & 0xff; - p[6] = (unsigned char)(v >> 48) & 0xff; - p[7] = (unsigned char)(v >> 56) & 0xff; -} - -size_t -poly1305_block_size_ref(void) { 
- return POLY1305_BLOCK_SIZE; -} - -void -poly1305_init_ext_ref(void *state, const poly1305_key *key, size_t bytes_hint) { - poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; - uint64_t t0, t1; - - /* bytes_hint not used */ - (void)bytes_hint; - - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - t0 = U8TO64(&key->b[0]); - t1 = U8TO64(&key->b[8]); - st->r[0] = ( t0 ) & 0xffc0fffffff; - st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; - st->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f; - - /* h = 0 */ - st->h[0] = 0; - st->h[1] = 0; - st->h[2] = 0; - - /* save pad for later */ - st->pad[0] = U8TO64(&key->b[16]); - st->pad[1] = U8TO64(&key->b[24]); - - st->final = 0; -} - -void -poly1305_blocks_ref(void *state, const unsigned char *in, size_t inlen) { - poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; - const uint64_t hibit = (st->final) ? 0 : ((uint64_t)1 << 40); /* 1 << 128 */ - uint64_t r0,r1,r2; - uint64_t s1,s2; - uint64_t h0,h1,h2; - uint64_t c; - uint128_t d0,d1,d2; - - r0 = st->r[0]; - r1 = st->r[1]; - r2 = st->r[2]; - - s1 = r1 * (5 << 2); - s2 = r2 * (5 << 2); - - h0 = st->h[0]; - h1 = st->h[1]; - h2 = st->h[2]; - - while (inlen >= POLY1305_BLOCK_SIZE) { - uint64_t t0, t1; - - /* h += in[i] */ - t0 = U8TO64(in + 0); - t1 = U8TO64(in + 8); - h0 += (( t0 ) & 0xfffffffffff); - h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff); - h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit; - - /* h *= r */ - d0 = ((uint128_t)h0 * r0) + ((uint128_t)h1 * s2) + ((uint128_t)h2 * s1); - d1 = ((uint128_t)h0 * r1) + ((uint128_t)h1 * r0) + ((uint128_t)h2 * s2); - d2 = ((uint128_t)h0 * r2) + ((uint128_t)h1 * r1) + ((uint128_t)h2 * r0); - - /* (partial) h %= p */ - c = (uint64_t)(d0 >> 44); h0 = (uint64_t)d0 & 0xfffffffffff; - d1 += c; c = (uint64_t)(d1 >> 44); h1 = (uint64_t)d1 & 0xfffffffffff; - d2 += c; c = (uint64_t)(d2 >> 42); h2 = (uint64_t)d2 & 0x3ffffffffff; - h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff; - h1 += c; - - in += POLY1305_BLOCK_SIZE; - inlen -= 
POLY1305_BLOCK_SIZE; - } - - st->h[0] = h0; - st->h[1] = h1; - st->h[2] = h2; -} - -void -poly1305_finish_ext_ref(void *state, const unsigned char *in, size_t remaining, unsigned char mac[16]) { - poly1305_state_ref_t *st = (poly1305_state_ref_t *)state; - uint64_t h0, h1, h2, c; - uint64_t g0, g1, g2; - uint64_t t0, t1; - - /* process the remaining block */ - if (remaining) { - unsigned char final[POLY1305_BLOCK_SIZE] = {0}; - size_t i; - for (i = 0; i < remaining; i++) - final[i] = in[i]; - final[remaining] = 1; - st->final = 1; - poly1305_blocks_ref(st, final, POLY1305_BLOCK_SIZE); - } - - /* fully carry h */ - h0 = st->h[0]; - h1 = st->h[1]; - h2 = st->h[2]; - - c = (h1 >> 44); h1 &= 0xfffffffffff; - h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; - h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; - h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff; - h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; - h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; - h1 += c; - - /* compute h + -p */ - g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff; - g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff; - g2 = h2 + c - ((uint64_t)1 << 42); - - /* select h if h < p, or h + -p if h >= p */ - c = (g2 >> 63) - 1; - h0 = (h0 & ~c) | (g0 & c); - h1 = (h1 & ~c) | (g1 & c); - h2 = (h2 & ~c) | (g2 & c); - - /* h = (h + pad) */ - t0 = st->pad[0]; - t1 = st->pad[1]; - - h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff; - h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; c = (h1 >> 44); h1 &= 0xfffffffffff; - h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; h2 &= 0x3ffffffffff; - - /* mac = h % (2^128) */ - h0 = ((h0 ) | (h1 << 44)); - h1 = ((h1 >> 20) | (h2 << 24)); - - U64TO8(&mac[0], h0); - U64TO8(&mac[8], h1); - - /* zero out the state */ - st->h[0] = 0; - st->h[1] = 0; - st->h[2] = 0; - st->r[0] = 0; - st->r[1] = 0; - st->r[2] = 0; - st->pad[0] = 0; - st->pad[1] = 0; -} - - -void -poly1305_auth_ref(unsigned char mac[16], const unsigned char *in, size_t inlen, const 
poly1305_key *key) { - poly1305_state_ref_t st; - size_t blocks; - poly1305_init_ext_ref(&st, key, inlen); - blocks = (inlen & ~(POLY1305_BLOCK_SIZE - 1)); - if (blocks) { - poly1305_blocks_ref(&st, in, blocks); - in += blocks; - inlen -= blocks; - } - poly1305_finish_ext_ref(&st, in, inlen, mac); -} - diff --git a/src/libcryptobox/poly1305/sse2.S b/src/libcryptobox/poly1305/sse2.S deleted file mode 100644 index 038961899..000000000 --- a/src/libcryptobox/poly1305/sse2.S +++ /dev/null @@ -1,969 +0,0 @@ -#include "../macro.S" -#include "constants.S" - -SECTION_TEXT - -GLOBAL_HIDDEN_FN poly1305_block_size_sse2 -movl $32, %eax -ret -FN_END poly1305_block_size_sse2 - -GLOBAL_HIDDEN_FN poly1305_init_ext_sse2 -poly1305_init_ext_sse2_local: -pushq %r15 -xorps %xmm0, %xmm0 -testq %rdx, %rdx -pushq %r14 -movq %rdx, %r11 -movq $-1, %rax -cmove %rax, %r11 -pushq %r13 -movabsq $17575274610687, %r9 -pushq %r12 -pushq %rbp -movq %r11, %r13 -movabsq $17592186044415, %rbp -pushq %rbx -xorl %ebx, %ebx -movdqu %xmm0, 32(%rdi) -movdqu %xmm0, (%rdi) -movdqu %xmm0, 16(%rdi) -movq 8(%rsi), %rcx -movq (%rsi), %rax -movq %rcx, %rdx -shrq $24, %rcx -andq %rax, %r9 -salq $20, %rdx -shrq $44, %rax -movq %r9, %r8 -orq %rax, %rdx -shrq $26, %r8 -movabsq $17592181915647, %rax -andq %rax, %rdx -movabsq $68719475727, %rax -andq %rax, %rcx -movl %r9d, %eax -andl $67108863, %eax -movl %eax, 40(%rdi) -movl %edx, %eax -sall $18, %eax -orl %r8d, %eax -movq %rdx, %r8 -andl $67108863, %eax -shrq $34, %r8 -movl %eax, 44(%rdi) -movq %rdx, %rax -shrq $8, %rax -andl $67108863, %eax -movl %eax, 48(%rdi) -movl %ecx, %eax -sall $10, %eax -orl %r8d, %eax -movq %rdi, %r8 -andl $67108863, %eax -movl %eax, 52(%rdi) -movq %rcx, %rax -shrq $16, %rax -movl %eax, 56(%rdi) -movq 16(%rsi), %rax -movq %rax, 104(%rdi) -movq 24(%rsi), %rax -movq %rdx, %rsi -movq %rax, 112(%rdi) -poly1305_init_ext_sse2_7: -testq %rbx, %rbx -jne poly1305_init_ext_sse2_4 -cmpq $16, %r13 -jbe poly1305_init_ext_sse2_5 -leaq 60(%r8), %rdi -jmp 
poly1305_init_ext_sse2_6 -poly1305_init_ext_sse2_4: -cmpq $96, %r13 -jb poly1305_init_ext_sse2_5 -leaq 80(%r8), %rdi -poly1305_init_ext_sse2_6: -imulq $20, %rcx, %r10 -movq $0, -48(%rsp) -movq $0, -32(%rsp) -leaq (%rsi,%rsi), %r14 -leaq (%r9,%r9), %r11 -movq %r10, %rax -mulq %r14 -movq %rax, %r14 -movq %r9, %rax -movq %rdx, %r15 -mulq %r9 -addq %rax, %r14 -movq %r14, %rax -adcq %rdx, %r15 -leaq (%rcx,%rcx), %rdx -andq %rbp, %rax -movq %rax, -16(%rsp) -movq %r11, %rax -movq %rdx, -24(%rsp) -mulq %rsi -movq %rax, %r11 -movq %r10, %rax -movq %rdx, %r12 -mulq %rcx -movq -16(%rsp), %rcx -addq %rax, %r11 -movq %r14, %rax -adcq %rdx, %r12 -shrdq $44, %r15, %rax -movq %rax, -56(%rsp) -movq -24(%rsp), %rax -addq -56(%rsp), %r11 -adcq -48(%rsp), %r12 -mulq %r9 -movq %r11, %r14 -andq %rbp, %r14 -movq %rax, %r9 -movq %rsi, %rax -movq %rdx, %r10 -mulq %rsi -addq %rax, %r9 -movq %r11, %rax -adcq %rdx, %r10 -shrdq $44, %r12, %rax -movq %rax, -40(%rsp) -movabsq $4398046511103, %rax -addq -40(%rsp), %r9 -adcq -32(%rsp), %r10 -andq %r9, %rax -incq %rbx -shrdq $42, %r10, %r9 -leaq (%r9,%r9,4), %r9 -addq %r9, %rcx -movq %rcx, %r9 -shrq $44, %rcx -addq %r14, %rcx -andq %rbp, %r9 -movq %rcx, %rsi -shrq $44, %rcx -movq %r9, %rdx -addq %rax, %rcx -movl %r9d, %eax -andq %rbp, %rsi -andl $67108863, %eax -shrq $26, %rdx -movl %eax, (%rdi) -movl %esi, %eax -sall $18, %eax -orl %edx, %eax -movq %rsi, %rdx -andl $67108863, %eax -shrq $34, %rdx -movl %eax, 4(%rdi) -movq %rsi, %rax -shrq $8, %rax -andl $67108863, %eax -movl %eax, 8(%rdi) -movl %ecx, %eax -sall $10, %eax -orl %edx, %eax -andl $67108863, %eax -movl %eax, 12(%rdi) -movq %rcx, %rax -shrq $16, %rax -cmpq $2, %rbx -movl %eax, 16(%rdi) -jne poly1305_init_ext_sse2_7 -poly1305_init_ext_sse2_5: -movq $0, 120(%r8) -popq %rbx -popq %rbp -popq %r12 -popq %r13 -popq %r14 -popq %r15 -ret -FN_END poly1305_init_ext_sse2 - - -GLOBAL_HIDDEN_FN poly1305_blocks_sse2 -poly1305_blocks_sse2_local: -pushq %rbp -movq %rsp, %rbp -pushq %rbx -andq $-64, 
%rsp -subq $328, %rsp -movq $(1 << 24), %rax -movd %rax, %xmm1 -movq $((1 << 26) - 1), %rax -movd %rax, %xmm0 -pshufd $68, %xmm1, %xmm1 -pshufd $68, %xmm0, %xmm0 -movq 120(%rdi), %rax -movaps %xmm1, 312(%rsp) -testb $4, %al -je poly1305_blocks_sse2_11 -movaps 312(%rsp), %xmm1 -psrldq $8, %xmm1 -movaps %xmm1, 312(%rsp) -poly1305_blocks_sse2_11: -testb $8, %al -je poly1305_blocks_sse2_12 -xorps %xmm1, %xmm1 -movaps %xmm1, 312(%rsp) -poly1305_blocks_sse2_12: -testb $1, %al -jne poly1305_blocks_sse2_13 -movq 16(%rsi), %xmm1 -movaps %xmm0, %xmm3 -movaps %xmm0, %xmm9 -movq (%rsi), %xmm15 -orq $1, %rax -subq $32, %rdx -movq 8(%rsi), %xmm12 -punpcklqdq %xmm1, %xmm15 -movq 24(%rsi), %xmm1 -movaps %xmm15, %xmm8 -pand %xmm15, %xmm3 -psrlq $52, %xmm15 -addq $32, %rsi -punpcklqdq %xmm1, %xmm12 -movaps %xmm12, %xmm1 -psrlq $26, %xmm8 -psllq $12, %xmm1 -pand %xmm0, %xmm8 -movq %rax, 120(%rdi) -por %xmm1, %xmm15 -psrlq $40, %xmm12 -pand %xmm15, %xmm9 -por 312(%rsp), %xmm12 -psrlq $26, %xmm15 -pand %xmm0, %xmm15 -jmp poly1305_blocks_sse2_14 -poly1305_blocks_sse2_13: -movdqu (%rdi), %xmm8 -movdqu 16(%rdi), %xmm15 -movdqu 32(%rdi), %xmm12 -pshufd $80, %xmm8, %xmm3 -pshufd $250, %xmm8, %xmm8 -pshufd $80, %xmm15, %xmm9 -pshufd $250, %xmm15, %xmm15 -pshufd $80, %xmm12, %xmm12 -poly1305_blocks_sse2_14: -movq 120(%rdi), %rax -testb $48, %al -je poly1305_blocks_sse2_15 -testb $16, %al -movd 56(%rdi), %xmm2 -leaq 40(%rdi), %rax -je poly1305_blocks_sse2_16 -movdqu 60(%rdi), %xmm1 -movdqu (%rax), %xmm4 -movd %xmm2, %eax -movd 76(%rdi), %xmm2 -movaps %xmm1, %xmm7 -movd %eax, %xmm5 -punpckldq %xmm4, %xmm7 -punpckhdq %xmm4, %xmm1 -punpcklqdq %xmm5, %xmm2 -jmp poly1305_blocks_sse2_17 -poly1305_blocks_sse2_16: -movdqu (%rax), %xmm1 -movl $1, %r8d -movd %r8d, %xmm4 -movaps %xmm1, %xmm7 -punpckldq %xmm4, %xmm7 -punpckhdq %xmm4, %xmm1 -poly1305_blocks_sse2_17: -pshufd $80, %xmm7, %xmm11 -pshufd $80, %xmm1, %xmm4 -pshufd $250, %xmm7, %xmm7 -movaps %xmm11, 168(%rsp) -pshufd $250, %xmm1, %xmm1 -jmp 
poly1305_blocks_sse2_18 -poly1305_blocks_sse2_15: -movdqu 60(%rdi), %xmm1 -movd 76(%rdi), %xmm2 -pshufd $0, %xmm2, %xmm2 -pshufd $0, %xmm1, %xmm11 -pshufd $85, %xmm1, %xmm7 -pshufd $170, %xmm1, %xmm4 -movaps %xmm11, 168(%rsp) -pshufd $255, %xmm1, %xmm1 -poly1305_blocks_sse2_18: -movaps %xmm1, %xmm14 -movaps %xmm7, %xmm5 -movaps %xmm4, %xmm13 -movaps %xmm1, 264(%rsp) -movaps %xmm2, %xmm1 -cmpq $63, %rdx -movq $(5), %r8 -movd %r8, %xmm6 -pshufd $68, %xmm6, %xmm6 -pmuludq %xmm6, %xmm5 -movaps %xmm4, 296(%rsp) -pmuludq %xmm6, %xmm13 -movaps %xmm2, 152(%rsp) -pmuludq %xmm6, %xmm14 -pmuludq %xmm6, %xmm1 -movaps %xmm5, 88(%rsp) -movaps %xmm13, 72(%rsp) -movaps %xmm14, 56(%rsp) -movaps %xmm1, 40(%rsp) -jbe poly1305_blocks_sse2_19 -movdqu 80(%rdi), %xmm1 -movd 96(%rdi), %xmm2 -movq %rdx, %rcx -pshufd $0, %xmm2, %xmm2 -movaps %xmm2, 24(%rsp) -pmuludq %xmm6, %xmm2 -pshufd $85, %xmm1, %xmm4 -movaps %xmm4, 280(%rsp) -pmuludq %xmm6, %xmm4 -pshufd $255, %xmm1, %xmm13 -pshufd $170, %xmm1, %xmm5 -movaps 72(%rsp), %xmm14 -movaps %xmm5, 216(%rsp) -pmuludq %xmm6, %xmm5 -movq %rsi, %rax -movaps %xmm4, -24(%rsp) -movaps %xmm13, %xmm4 -pshufd $0, %xmm1, %xmm1 -pmuludq %xmm6, %xmm4 -movaps %xmm14, -8(%rsp) -movaps %xmm5, 8(%rsp) -movaps 168(%rsp), %xmm5 -movaps %xmm1, 248(%rsp) -movaps 56(%rsp), %xmm1 -movaps %xmm4, 120(%rsp) -movaps 40(%rsp), %xmm4 -movaps %xmm13, 136(%rsp) -movaps %xmm2, 200(%rsp) -movaps %xmm1, 104(%rsp) -movaps %xmm4, 184(%rsp) -movaps %xmm5, 232(%rsp) -jmp poly1305_blocks_sse2_20 -.p2align 6 -poly1305_blocks_sse2_20: -movaps -24(%rsp), %xmm5 -movaps %xmm8, %xmm13 -subq $64, %rcx -movaps 8(%rsp), %xmm4 -movaps 120(%rsp), %xmm10 -pmuludq %xmm12, %xmm5 -pmuludq %xmm15, %xmm4 -movaps 8(%rsp), %xmm2 -pmuludq %xmm9, %xmm10 -movaps 120(%rsp), %xmm11 -movaps 200(%rsp), %xmm14 -pmuludq %xmm12, %xmm2 -paddq %xmm4, %xmm5 -pmuludq %xmm15, %xmm11 -movaps 120(%rsp), %xmm1 -paddq %xmm10, %xmm5 -pmuludq %xmm8, %xmm14 -movaps 200(%rsp), %xmm10 -movaps 200(%rsp), %xmm4 -pmuludq 
%xmm12, %xmm1 -movaps 248(%rsp), %xmm8 -pmuludq %xmm15, %xmm10 -paddq %xmm11, %xmm2 -pmuludq %xmm12, %xmm4 -paddq %xmm14, %xmm5 -movaps 200(%rsp), %xmm11 -movaps 248(%rsp), %xmm14 -pmuludq %xmm15, %xmm8 -pmuludq 248(%rsp), %xmm12 -pmuludq %xmm9, %xmm11 -paddq %xmm10, %xmm1 -movaps 248(%rsp), %xmm10 -pmuludq 280(%rsp), %xmm15 -pmuludq %xmm3, %xmm14 -paddq %xmm15, %xmm12 -paddq %xmm8, %xmm4 -pmuludq %xmm13, %xmm10 -movq 24(%rax), %xmm15 -movaps 248(%rsp), %xmm8 -paddq %xmm11, %xmm2 -movaps %xmm3, %xmm11 -movaps 280(%rsp), %xmm3 -paddq %xmm14, %xmm5 -pmuludq %xmm9, %xmm8 -paddq %xmm10, %xmm2 -movq 16(%rax), %xmm14 -movaps 280(%rsp), %xmm10 -pmuludq %xmm9, %xmm3 -pmuludq 216(%rsp), %xmm9 -paddq %xmm9, %xmm12 -paddq %xmm8, %xmm1 -movq (%rax), %xmm8 -pmuludq %xmm11, %xmm10 -paddq %xmm3, %xmm4 -movaps 216(%rsp), %xmm3 -punpcklqdq %xmm14, %xmm8 -movaps 280(%rsp), %xmm14 -pmuludq %xmm13, %xmm3 -paddq %xmm10, %xmm2 -movq 8(%rax), %xmm10 -pmuludq %xmm13, %xmm14 -pmuludq 136(%rsp), %xmm13 -paddq %xmm13, %xmm12 -punpcklqdq %xmm15, %xmm10 -movaps %xmm10, %xmm9 -movaps 216(%rsp), %xmm15 -paddq %xmm3, %xmm4 -psllq $12, %xmm9 -movaps %xmm0, %xmm3 -paddq %xmm14, %xmm1 -pmuludq %xmm11, %xmm15 -pand %xmm8, %xmm3 -movaps 136(%rsp), %xmm14 -movaps %xmm3, -40(%rsp) -movaps %xmm8, %xmm3 -movdqu 48(%rax), %xmm13 -psrlq $52, %xmm8 -pmuludq %xmm11, %xmm14 -paddq %xmm15, %xmm1 -por %xmm9, %xmm8 -pmuludq 24(%rsp), %xmm11 -paddq %xmm11, %xmm12 -movdqu 32(%rax), %xmm11 -movaps %xmm10, %xmm9 -psrlq $40, %xmm10 -pand %xmm0, %xmm8 -movaps %xmm11, %xmm15 -paddq %xmm14, %xmm4 -xorps %xmm14, %xmm14 -punpckldq %xmm13, %xmm15 -psrlq $14, %xmm9 -addq $64, %rax -pand %xmm0, %xmm9 -psrlq $26, %xmm3 -cmpq $63, %rcx -por 312(%rsp), %xmm10 -movaps %xmm13, -72(%rsp) -movaps %xmm15, %xmm13 -punpckldq %xmm14, %xmm13 -punpckhdq -72(%rsp), %xmm11 -movaps %xmm13, -56(%rsp) -movaps %xmm11, %xmm13 -punpckhdq %xmm14, %xmm11 -pand %xmm0, %xmm3 -psllq $18, %xmm11 -punpckhdq %xmm14, %xmm15 -punpckldq %xmm14, %xmm13 
-paddq %xmm11, %xmm4 -movaps -8(%rsp), %xmm11 -psllq $6, %xmm15 -psllq $12, %xmm13 -movaps 88(%rsp), %xmm14 -paddq %xmm15, %xmm2 -pmuludq %xmm10, %xmm11 -paddq %xmm13, %xmm1 -movaps -8(%rsp), %xmm13 -pmuludq %xmm10, %xmm14 -paddq -56(%rsp), %xmm5 -paddq 312(%rsp), %xmm12 -pmuludq %xmm9, %xmm13 -movaps 104(%rsp), %xmm15 -paddq %xmm11, %xmm2 -movaps 184(%rsp), %xmm11 -paddq %xmm14, %xmm5 -movaps 104(%rsp), %xmm14 -pmuludq %xmm9, %xmm15 -pmuludq %xmm10, %xmm11 -paddq %xmm13, %xmm5 -movaps 104(%rsp), %xmm13 -pmuludq %xmm10, %xmm14 -pmuludq 232(%rsp), %xmm10 -paddq %xmm10, %xmm12 -pmuludq %xmm8, %xmm13 -paddq %xmm15, %xmm2 -movaps %xmm8, %xmm10 -paddq %xmm11, %xmm4 -pmuludq %xmm7, %xmm10 -movaps 232(%rsp), %xmm11 -movaps 184(%rsp), %xmm15 -paddq %xmm14, %xmm1 -pmuludq %xmm9, %xmm11 -paddq %xmm13, %xmm5 -movaps 184(%rsp), %xmm13 -movaps 184(%rsp), %xmm14 -pmuludq %xmm3, %xmm15 -pmuludq %xmm9, %xmm13 -paddq %xmm11, %xmm4 -pmuludq %xmm8, %xmm14 -movaps 232(%rsp), %xmm11 -paddq %xmm10, %xmm4 -paddq %xmm15, %xmm5 -pmuludq %xmm7, %xmm9 -pmuludq %xmm8, %xmm11 -paddq %xmm13, %xmm1 -movaps 232(%rsp), %xmm13 -movaps 296(%rsp), %xmm10 -paddq %xmm14, %xmm2 -pmuludq 296(%rsp), %xmm8 -movaps -40(%rsp), %xmm14 -pmuludq %xmm3, %xmm13 -paddq %xmm9, %xmm12 -paddq %xmm11, %xmm1 -movaps %xmm3, %xmm11 -paddq %xmm8, %xmm12 -movaps 232(%rsp), %xmm15 -pmuludq %xmm7, %xmm11 -pmuludq %xmm3, %xmm10 -paddq %xmm13, %xmm2 -movaps %xmm14, %xmm13 -movaps 296(%rsp), %xmm9 -pmuludq %xmm14, %xmm15 -pmuludq 264(%rsp), %xmm3 -paddq %xmm11, %xmm1 -pmuludq %xmm7, %xmm13 -paddq %xmm3, %xmm12 -movaps 264(%rsp), %xmm11 -paddq %xmm10, %xmm4 -pmuludq %xmm14, %xmm9 -paddq %xmm15, %xmm5 -pmuludq %xmm14, %xmm11 -movaps %xmm5, %xmm8 -paddq %xmm13, %xmm2 -psrlq $26, %xmm8 -paddq %xmm9, %xmm1 -pand %xmm0, %xmm5 -pmuludq 152(%rsp), %xmm14 -paddq %xmm14, %xmm12 -paddq %xmm8, %xmm2 -paddq %xmm11, %xmm4 -movaps %xmm2, %xmm9 -movaps %xmm2, %xmm8 -movaps %xmm4, %xmm3 -psrlq $26, %xmm9 -pand %xmm0, %xmm4 -psrlq $26, %xmm3 
-paddq %xmm9, %xmm1 -pand %xmm0, %xmm8 -paddq %xmm3, %xmm12 -movaps %xmm1, %xmm10 -movaps %xmm1, %xmm9 -movaps %xmm12, %xmm3 -psrlq $26, %xmm10 -pand %xmm0, %xmm12 -psrlq $26, %xmm3 -paddq %xmm10, %xmm4 -pand %xmm0, %xmm9 -pmuludq %xmm6, %xmm3 -movaps %xmm4, %xmm1 -movaps %xmm4, %xmm15 -psrlq $26, %xmm1 -pand %xmm0, %xmm15 -paddq %xmm1, %xmm12 -paddq %xmm3, %xmm5 -movaps %xmm5, %xmm2 -movaps %xmm5, %xmm3 -psrlq $26, %xmm2 -pand %xmm0, %xmm3 -paddq %xmm2, %xmm8 -ja poly1305_blocks_sse2_20 -leaq -64(%rdx), %rax -andl $63, %edx -andq $-64, %rax -leaq 64(%rsi,%rax), %rsi -poly1305_blocks_sse2_19: -cmpq $31, %rdx -jbe poly1305_blocks_sse2_21 -movaps 56(%rsp), %xmm11 -movaps %xmm15, %xmm1 -movaps %xmm15, %xmm14 -movaps 72(%rsp), %xmm5 -movaps %xmm12, %xmm4 -movaps %xmm15, %xmm10 -movaps 88(%rsp), %xmm2 -pmuludq %xmm11, %xmm14 -movaps %xmm8, %xmm15 -pmuludq %xmm5, %xmm1 -movaps 40(%rsp), %xmm13 -testq %rsi, %rsi -pmuludq %xmm12, %xmm2 -pmuludq %xmm12, %xmm5 -pmuludq %xmm11, %xmm4 -paddq %xmm1, %xmm2 -pmuludq %xmm9, %xmm11 -movaps %xmm12, %xmm1 -paddq %xmm14, %xmm5 -pmuludq %xmm13, %xmm15 -movaps %xmm9, %xmm14 -pmuludq %xmm13, %xmm14 -pmuludq %xmm13, %xmm1 -paddq %xmm11, %xmm2 -movaps 168(%rsp), %xmm11 -pmuludq %xmm10, %xmm13 -paddq %xmm15, %xmm2 -movaps %xmm9, %xmm15 -paddq %xmm14, %xmm5 -pmuludq %xmm11, %xmm12 -movaps %xmm3, %xmm14 -pmuludq %xmm11, %xmm14 -movaps %xmm13, 248(%rsp) -movaps %xmm10, %xmm13 -pmuludq %xmm7, %xmm15 -paddq 248(%rsp), %xmm4 -pmuludq %xmm11, %xmm13 -pmuludq %xmm7, %xmm10 -paddq %xmm14, %xmm2 -movaps %xmm13, 280(%rsp) -movaps %xmm8, %xmm13 -pmuludq %xmm11, %xmm13 -paddq %xmm10, %xmm12 -movaps 296(%rsp), %xmm10 -paddq 280(%rsp), %xmm1 -pmuludq %xmm9, %xmm11 -pmuludq 296(%rsp), %xmm9 -pmuludq %xmm3, %xmm10 -paddq %xmm9, %xmm12 -paddq %xmm13, %xmm5 -movaps %xmm3, %xmm13 -paddq %xmm15, %xmm1 -pmuludq %xmm7, %xmm13 -paddq %xmm11, %xmm4 -movaps 296(%rsp), %xmm11 -pmuludq %xmm8, %xmm7 -pmuludq %xmm8, %xmm11 -pmuludq 264(%rsp), %xmm8 -paddq %xmm8, %xmm12 
-paddq %xmm13, %xmm5 -paddq %xmm7, %xmm4 -movaps 264(%rsp), %xmm7 -paddq %xmm11, %xmm1 -paddq %xmm10, %xmm4 -pmuludq %xmm3, %xmm7 -pmuludq 152(%rsp), %xmm3 -paddq %xmm3, %xmm12 -paddq %xmm7, %xmm1 -je poly1305_blocks_sse2_22 -movdqu (%rsi), %xmm7 -xorps %xmm3, %xmm3 -paddq 312(%rsp), %xmm12 -movdqu 16(%rsi), %xmm8 -movaps %xmm7, %xmm9 -punpckldq %xmm8, %xmm9 -punpckhdq %xmm8, %xmm7 -movaps %xmm9, %xmm10 -movaps %xmm7, %xmm8 -punpckldq %xmm3, %xmm10 -punpckhdq %xmm3, %xmm9 -punpckhdq %xmm3, %xmm7 -punpckldq %xmm3, %xmm8 -movaps %xmm8, %xmm3 -psllq $6, %xmm9 -paddq %xmm10, %xmm2 -psllq $12, %xmm3 -paddq %xmm9, %xmm5 -psllq $18, %xmm7 -paddq %xmm3, %xmm4 -paddq %xmm7, %xmm1 -poly1305_blocks_sse2_22: -movaps %xmm2, %xmm8 -movaps %xmm1, %xmm3 -movaps %xmm1, %xmm15 -psrlq $26, %xmm8 -pand %xmm0, %xmm2 -pand %xmm0, %xmm15 -psrlq $26, %xmm3 -paddq %xmm5, %xmm8 -paddq %xmm12, %xmm3 -movaps %xmm8, %xmm9 -pand %xmm0, %xmm8 -movaps %xmm3, %xmm1 -psrlq $26, %xmm9 -movaps %xmm3, %xmm12 -psrlq $26, %xmm1 -paddq %xmm4, %xmm9 -pand %xmm0, %xmm12 -pmuludq %xmm1, %xmm6 -movaps %xmm9, %xmm3 -pand %xmm0, %xmm9 -psrlq $26, %xmm3 -paddq %xmm3, %xmm15 -paddq %xmm6, %xmm2 -movaps %xmm15, %xmm3 -pand %xmm0, %xmm15 -movaps %xmm2, %xmm1 -psrlq $26, %xmm3 -psrlq $26, %xmm1 -paddq %xmm3, %xmm12 -movaps %xmm0, %xmm3 -paddq %xmm1, %xmm8 -pand %xmm2, %xmm3 -poly1305_blocks_sse2_21: -testq %rsi, %rsi -je poly1305_blocks_sse2_23 -pshufd $8, %xmm3, %xmm3 -pshufd $8, %xmm8, %xmm8 -pshufd $8, %xmm9, %xmm9 -pshufd $8, %xmm15, %xmm15 -pshufd $8, %xmm12, %xmm12 -punpcklqdq %xmm8, %xmm3 -punpcklqdq %xmm15, %xmm9 -movdqu %xmm3, (%rdi) -movdqu %xmm9, 16(%rdi) -movq %xmm12, 32(%rdi) -jmp poly1305_blocks_sse2_10 -poly1305_blocks_sse2_23: -movaps %xmm3, %xmm0 -movaps %xmm8, %xmm4 -movaps %xmm9, %xmm2 -psrldq $8, %xmm0 -movaps %xmm15, %xmm10 -paddq %xmm0, %xmm3 -psrldq $8, %xmm4 -movaps %xmm12, %xmm0 -movd %xmm3, %edx -paddq %xmm4, %xmm8 -psrldq $8, %xmm2 -movl %edx, %ecx -movd %xmm8, %eax -paddq %xmm2, %xmm9 
-shrl $26, %ecx -psrldq $8, %xmm10 -andl $67108863, %edx -addl %ecx, %eax -movd %xmm9, %ecx -paddq %xmm10, %xmm15 -movl %eax, %r9d -shrl $26, %eax -psrldq $8, %xmm0 -addl %ecx, %eax -movd %xmm15, %ecx -paddq %xmm0, %xmm12 -movl %eax, %esi -andl $67108863, %r9d -movd %xmm12, %r10d -shrl $26, %esi -andl $67108863, %eax -addl %ecx, %esi -salq $8, %rax -movl %r9d, %ecx -shrl $18, %r9d -movl %esi, %r8d -shrl $26, %esi -andl $67108863, %r8d -addl %r10d, %esi -orq %r9, %rax -salq $16, %rsi -movq %r8, %r9 -shrl $10, %r8d -salq $26, %rcx -orq %r8, %rsi -salq $34, %r9 -orq %rdx, %rcx -movq %rsi, %r11 -shrq $42, %rsi -movabsq $17592186044415, %rdx -orq %r9, %rax -movabsq $4398046511103, %r8 -andq %rdx, %rcx -andq %rdx, %rax -andq %r8, %r11 -leaq (%rsi,%rsi,4), %rsi -addq %rsi, %rcx -movq %rcx, %r10 -shrq $44, %rcx -addq %rcx, %rax -andq %rdx, %r10 -movq %rax, %r9 -shrq $44, %rax -addq %r11, %rax -andq %rdx, %r9 -movabsq $-4398046511104, %r11 -movq %rax, %rcx -andq %r8, %rcx -shrq $42, %rax -leaq (%rax,%rax,4), %rsi -addq %rcx, %r11 -addq %r10, %rsi -movq %rsi, %r8 -shrq $44, %rsi -andq %rdx, %r8 -addq %r9, %rsi -leaq 5(%r8), %r9 -movq %r9, %rbx -andq %rdx, %r9 -shrq $44, %rbx -addq %rsi, %rbx -movq %rbx, %rax -andq %rbx, %rdx -shrq $44, %rax -addq %rax, %r11 -movq %r11, %rax -shrq $63, %rax -decq %rax -movq %rax, %r10 -andq %rax, %r9 -andq %rax, %rdx -notq %r10 -andq %r11, %rax -andq %r10, %r8 -andq %r10, %rsi -andq %r10, %rcx -orq %r9, %r8 -orq %rdx, %rsi -orq %rax, %rcx -movq %r8, (%rdi) -movq %rsi, 8(%rdi) -movq %rcx, 16(%rdi) -poly1305_blocks_sse2_10: -movq -8(%rbp), %rbx -leave -ret -FN_END poly1305_blocks_sse2 - -GLOBAL_HIDDEN_FN poly1305_finish_ext_sse2 -poly1305_finish_ext_sse2_local: -pushq %r12 -movq %rcx, %r12 -pushq %rbp -movq %rdx, %rbp -pushq %rbx -movq %rdi, %rbx -subq $32, %rsp -testq %rdx, %rdx -je poly1305_finish_ext_sse2_27 -xorl %eax, %eax -movq %rsp, %rdi -movl $8, %ecx -rep stosl -subq %rsp, %rsi -testb $16, %dl -movq %rsp, %rax -je 
poly1305_finish_ext_sse2_28 -movdqu (%rsp,%rsi), %xmm0 -addq $16, %rax -movaps %xmm0, (%rsp) -poly1305_finish_ext_sse2_28: -testb $8, %bpl -je poly1305_finish_ext_sse2_29 -movq (%rax,%rsi), %rdx -movq %rdx, (%rax) -addq $8, %rax -poly1305_finish_ext_sse2_29: -testb $4, %bpl -je poly1305_finish_ext_sse2_30 -movl (%rax,%rsi), %edx -movl %edx, (%rax) -addq $4, %rax -poly1305_finish_ext_sse2_30: -testb $2, %bpl -je poly1305_finish_ext_sse2_31 -movw (%rax,%rsi), %dx -movw %dx, (%rax) -addq $2, %rax -poly1305_finish_ext_sse2_31: -testb $1, %bpl -je poly1305_finish_ext_sse2_32 -movb (%rax,%rsi), %dl -movb %dl, (%rax) -poly1305_finish_ext_sse2_32: -cmpq $16, %rbp -je poly1305_finish_ext_sse2_33 -movb $1, (%rsp,%rbp) -poly1305_finish_ext_sse2_33: -cmpq $16, %rbp -movl $32, %edx -movq %rsp, %rsi -sbbq %rax, %rax -movq %rbx, %rdi -andl $4, %eax -addq $4, %rax -orq %rax, 120(%rbx) -call poly1305_blocks_sse2_local -poly1305_finish_ext_sse2_27: -movq 120(%rbx), %rax -testb $1, %al -je poly1305_finish_ext_sse2_35 -decq %rbp -cmpq $15, %rbp -jbe poly1305_finish_ext_sse2_36 -orq $16, %rax -jmp poly1305_finish_ext_sse2_40 -poly1305_finish_ext_sse2_36: -orq $32, %rax -poly1305_finish_ext_sse2_40: -movq %rax, 120(%rbx) -movl $32, %edx -xorl %esi, %esi -movq %rbx, %rdi -call poly1305_blocks_sse2_local -poly1305_finish_ext_sse2_35: -movq 8(%rbx), %rax -movq 112(%rbx), %rsi -movq %rax, %rdx -movq %rax, %rcx -movq 16(%rbx), %rax -shrq $20, %rcx -salq $44, %rdx -orq (%rbx), %rdx -salq $24, %rax -orq %rcx, %rax -movq 104(%rbx), %rcx -addq %rcx, %rdx -adcq %rsi, %rax -xorps %xmm0, %xmm0 -movdqu %xmm0, (%rbx) -movdqu %xmm0, 16(%rbx) -movdqu %xmm0, 32(%rbx) -movdqu %xmm0, 48(%rbx) -movdqu %xmm0, 64(%rbx) -movdqu %xmm0, 80(%rbx) -movdqu %xmm0, 96(%rbx) -movdqu %xmm0, 112(%rbx) -movq %rdx, (%r12) -movq %rax, 8(%r12) -addq $32, %rsp -popq %rbx -popq %rbp -popq %r12 -ret -FN_END poly1305_finish_ext_sse2 - -GLOBAL_HIDDEN_FN poly1305_auth_sse2 -/* -cmpq $128, %rdx -jb poly1305_auth_x86_local -*/ 
-pushq %rbp -movq %rsp, %rbp -pushq %r14 -pushq %r13 -movq %rdi, %r13 -pushq %r12 -movq %rsi, %r12 -movq %rcx, %rsi -pushq %rbx -movq %rdx, %rbx -andq $-64, %rsp -movq %rbx, %r14 -addq $-128, %rsp -movq %rsp, %rdi -call poly1305_init_ext_sse2_local -andq $-32, %r14 -je poly1305_auth_sse2_42 -movq %r12, %rsi -movq %r14, %rdx -movq %rsp, %rdi -call poly1305_blocks_sse2_local -addq %r14, %r12 -subq %r14, %rbx -poly1305_auth_sse2_42: -movq %r13, %rcx -movq %rbx, %rdx -movq %r12, %rsi -movq %rsp, %rdi -call poly1305_finish_ext_sse2_local -leaq -32(%rbp), %rsp -popq %rbx -popq %r12 -popq %r13 -popq %r14 -popq %rbp -ret -FN_END poly1305_auth_sse2 - - - - - |