aboutsummaryrefslogtreecommitdiffstats
path: root/src/libcryptobox/poly1305/avx2.S
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2015-02-07 22:10:07 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2015-02-07 22:28:36 +0000
commit06a8ad2bae9e0aa0fe62e6059198bb3ec57eb08f (patch)
treea9d9d28c941840486c6697362dac219204672fda /src/libcryptobox/poly1305/avx2.S
parent1e2ff82baa69251c79576609c2a94bd0c006cd72 (diff)
downloadrspamd-06a8ad2bae9e0aa0fe62e6059198bb3ec57eb08f.tar.gz
rspamd-06a8ad2bae9e0aa0fe62e6059198bb3ec57eb08f.zip
Use optimized version of poly1305.
Diffstat (limited to 'src/libcryptobox/poly1305/avx2.S')
-rw-r--r--src/libcryptobox/poly1305/avx2.S1093
1 file changed, 1093 insertions, 0 deletions
diff --git a/src/libcryptobox/poly1305/avx2.S b/src/libcryptobox/poly1305/avx2.S
new file mode 100644
index 000000000..068e24d3d
--- /dev/null
+++ b/src/libcryptobox/poly1305/avx2.S
@@ -0,0 +1,1093 @@
+#include "../chacha20/macro.S"
+#include "constants.S"
+SECTION_TEXT
+
+/*
+ * size_t poly1305_block_size_avx2(void)
+ * Returns 64: the byte stride this backend consumes per blocks call
+ * (four 16-byte Poly1305 blocks processed in parallel).
+ */
+GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx2, 0, 0
+movl $64, %eax
+ret
+FN_END poly1305_block_size_avx2
+
+/*
+ * poly1305_auth_avx2(mac %rdi, m %rsi, bytes %rdx, key %rcx)
+ * One-shot MAC driver (SysV AMD64 ABI).  Messages shorter than 128
+ * bytes are delegated to the x86 implementation; otherwise a 64-byte
+ * aligned scratch state is built on the stack and run through
+ * init_ext / blocks / finish_ext.
+ */
+GLOBAL_HIDDEN_FN poly1305_auth_avx2
+cmp $128, %rdx
+jb poly1305_auth_x86_local
+pushq %rbp
+movq %rsp, %rbp
+/* align rsp to 64 so the state at (%rsp) is cache-line/ymm aligned;
+   4 pushes + 224 = 256 bytes keeps that alignment at the call sites */
+andq $-64, %rsp
+pushq %r12
+pushq %r14
+pushq %r15
+pushq %rbx
+subq $224, %rsp
+movq %rsi, %r14                 /* r14 = message pointer */
+movq %rdi, %rbx                 /* rbx = mac output pointer */
+lea (%rsp), %rdi                /* rdi = state (on stack) */
+movq %rcx, %rsi                 /* rsi = 32-byte key */
+movq %rdx, %r12                 /* r12 = total message length */
+call poly1305_init_ext_avx2_local
+poly1305_auth_avx2_2:
+/* r15 = length rounded down to a multiple of 64; bulk-process it */
+movq %r12, %r15
+andq $-64, %r15
+je poly1305_auth_avx2_5
+poly1305_auth_avx2_3:
+movq %r14, %rsi
+lea (%rsp), %rdi
+movq %r15, %rdx
+call poly1305_blocks_avx2_local
+poly1305_auth_avx2_4:
+addq %r15, %r14                 /* advance past the bulk-processed bytes */
+subq %r15, %r12                 /* r12 = remaining (< 64) bytes */
+poly1305_auth_avx2_5:
+/* finish_ext(state, tail, tail_len, mac) pads, flushes and writes the tag */
+movq %r14, %rsi
+lea (%rsp), %rdi
+movq %r12, %rdx
+movq %rbx, %rcx
+call poly1305_finish_ext_avx2_local
+poly1305_auth_avx2_6:
+addq $224, %rsp
+popq %rbx
+popq %r15
+popq %r14
+popq %r12
+movq %rbp, %rsp
+popq %rbp
+ret
+FN_END poly1305_auth_avx2
+
+
+/*
+ * poly1305_finish_ext_avx2(state %rdi, src %rsi, remaining %rdx, mac %rcx)
+ * Absorbs the final (< 64-byte) message fragment and emits the 16-byte
+ * tag.  The fragment is copied into a zeroed 64-byte stack buffer,
+ * 0x01-padded, then fed through poly1305_blocks with control bits set
+ * in the flag word at state+176 (mask 0x1FBC holds the per-pass
+ * shutdown/power-selection bits read by poly1305_blocks; bit 6 marks
+ * the final reducing pass).  Afterwards the three 44-bit accumulator
+ * limbs are recombined to 128 bits, the pad s (state+160) is added,
+ * the state is wiped and the tag stored.
+ * Saved regs: r12 = state, r13 = remaining, r14 = mac.
+ */
+GLOBAL_HIDDEN_FN poly1305_finish_ext_avx2
+poly1305_finish_ext_avx2_local:
+pushq %rbp
+movq %rsp, %rbp
+andq $-64, %rsp
+pushq %r12
+pushq %r13
+pushq %r14
+subq $104, %rsp
+movq %rdx, %r13
+movq %rcx, %r14
+movq %rdi, %r12
+testq %r13, %r13
+je poly1305_finish_ext_avx2_29  /* no tail bytes: skip straight to flush */
+poly1305_finish_ext_avx2_2:
+/* zero the 64-byte buffer; rsi becomes (src - buffer) so that
+   (%rax,%rsi) reads the source while rax walks the destination */
+lea (%rsp), %rax
+vpxor %ymm0, %ymm0, %ymm0
+subq %rax, %rsi
+vmovdqu %ymm0, (%rsp)
+vmovdqu %ymm0, 32(%rsp)
+/* copy `remaining` bytes, one branch per length bit: 32/16/8/4/2/1 */
+testq $32, %r13
+je poly1305_finish_ext_avx2_4
+poly1305_finish_ext_avx2_3:
+vmovdqu (%rsp,%rsi), %ymm0
+lea 32(%rsp), %rax
+vmovdqu %ymm0, (%rsp)
+poly1305_finish_ext_avx2_4:
+testq $16, %r13
+je poly1305_finish_ext_avx2_6
+poly1305_finish_ext_avx2_5:
+vmovdqu (%rax,%rsi), %xmm0
+vmovdqu %xmm0, (%rax)
+addq $16, %rax
+poly1305_finish_ext_avx2_6:
+testq $8, %r13
+je poly1305_finish_ext_avx2_8
+poly1305_finish_ext_avx2_7:
+movq (%rax,%rsi), %rdx
+movq %rdx, (%rax)
+addq $8, %rax
+poly1305_finish_ext_avx2_8:
+testq $4, %r13
+je poly1305_finish_ext_avx2_10
+poly1305_finish_ext_avx2_9:
+movl (%rax,%rsi), %edx
+movl %edx, (%rax)
+addq $4, %rax
+poly1305_finish_ext_avx2_10:
+testq $2, %r13
+je poly1305_finish_ext_avx2_12
+poly1305_finish_ext_avx2_11:
+movzwl (%rax,%rsi), %edx
+movw %dx, (%rax)
+addq $2, %rax
+poly1305_finish_ext_avx2_12:
+testq $1, %r13
+je poly1305_finish_ext_avx2_14
+poly1305_finish_ext_avx2_13:
+movb (%rax,%rsi), %dl
+movb %dl, (%rax)
+poly1305_finish_ext_avx2_14:
+/* append the 0x01 pad byte when the last 16-byte block is partial */
+testq $15, %r13
+je poly1305_finish_ext_avx2_16
+poly1305_finish_ext_avx2_15:
+movb $1, (%rsp,%r13)
+poly1305_finish_ext_avx2_16:
+/* rebuild the control bits by tail length: -8125 == ~0x1FBC clears
+   the old selectors; then set bit2 (>=48), bit3 (>=32), bit4 (>=16)
+   or bit5 (<16) for poly1305_blocks' lane-pad handling */
+movq 176(%r12), %rdx
+andq $-8125, %rdx
+cmpq $48, %r13
+jb poly1305_finish_ext_avx2_18
+poly1305_finish_ext_avx2_17:
+orq $4, %rdx
+jmp poly1305_finish_ext_avx2_21
+poly1305_finish_ext_avx2_18:
+cmpq $32, %r13
+jb poly1305_finish_ext_avx2_20
+poly1305_finish_ext_avx2_19:
+orq $8, %rdx
+jmp poly1305_finish_ext_avx2_21
+poly1305_finish_ext_avx2_20:
+movq %rdx, %rax
+orq $32, %rdx
+orq $16, %rax
+cmpq $16, %r13
+cmovae %rax, %rdx
+poly1305_finish_ext_avx2_21:
+/* if bit0 is set (blocks already ran at least once), also request the
+   matching power-of-r selector: bit8 for <=16 tail, bit7 for 17..32 */
+testq $1, %rdx
+je poly1305_finish_ext_avx2_27
+poly1305_finish_ext_avx2_22:
+cmpq $16, %r13
+ja poly1305_finish_ext_avx2_24
+poly1305_finish_ext_avx2_23:
+orq $256, %rdx
+movq %rdx, 176(%r12)
+jmp poly1305_finish_ext_avx2_28
+poly1305_finish_ext_avx2_24:
+cmpq $32, %r13
+ja poly1305_finish_ext_avx2_27
+poly1305_finish_ext_avx2_25:
+orq $128, %rdx
+movq %rdx, 176(%r12)
+jmp poly1305_finish_ext_avx2_28
+poly1305_finish_ext_avx2_27:
+movq %rdx, 176(%r12)
+poly1305_finish_ext_avx2_28:
+/* absorb the padded 64-byte buffer */
+movq %r12, %rdi
+lea (%rsp), %rsi
+movl $64, %edx
+vzeroupper
+call poly1305_blocks_avx2_local
+poly1305_finish_ext_avx2_29:
+/* if hashing ever started (bit0), run one all-zero 64-byte block with
+   the shutdown selectors (512/1024/2048/4096 by tail length) plus
+   bits 5 and 6 (0x60) so blocks collapses the lanes and fully reduces */
+movq 176(%r12), %rdx
+testq $1, %rdx
+je poly1305_finish_ext_avx2_37
+poly1305_finish_ext_avx2_30:
+andq $-8125, %rdx
+testq %r13, %r13
+je poly1305_finish_ext_avx2_32
+poly1305_finish_ext_avx2_31:
+cmpq $48, %r13
+jbe poly1305_finish_ext_avx2_33
+poly1305_finish_ext_avx2_32:
+orq $512, %rdx
+jmp poly1305_finish_ext_avx2_36
+poly1305_finish_ext_avx2_33:
+cmpq $32, %r13
+jbe poly1305_finish_ext_avx2_35
+poly1305_finish_ext_avx2_34:
+orq $1024, %rdx
+jmp poly1305_finish_ext_avx2_36
+poly1305_finish_ext_avx2_35:
+movq %rdx, %rax
+orq $4096, %rdx
+orq $2048, %rax
+cmpq $16, %r13
+cmova %rax, %rdx
+poly1305_finish_ext_avx2_36:
+orq $96, %rdx
+movq %r12, %rdi
+vpxor %ymm0, %ymm0, %ymm0
+lea (%rsp), %rsi
+movq %rdx, 176(%r12)
+movl $64, %edx
+vmovdqu %ymm0, (%rsp)
+vmovdqu %ymm0, 32(%rsp)
+vzeroupper
+call poly1305_blocks_avx2_local
+poly1305_finish_ext_avx2_37:
+/* combine the 44/44/42-bit limbs h0,h1,h2 (state+0/+8/+16) into a
+   128-bit value: lo = h0 | h1<<44, hi = h1>>20 | h2<<24 */
+movq 8(%r12), %r8
+movq %r8, %rsi
+movq 16(%r12), %rax
+vpxor %ymm0, %ymm0, %ymm0
+shlq $44, %rsi
+shrq $20, %r8
+shlq $24, %rax
+orq (%r12), %rsi
+orq %rax, %r8
+/* add the 128-bit pad s stored at state+160 by init_ext */
+movq 160(%r12), %rdx
+movq 168(%r12), %rcx
+addq %rdx, %rsi
+adcq %rcx, %r8
+/* wipe the whole 192-byte state before returning */
+vmovdqu %ymm0, (%r12)
+vmovdqu %ymm0, 32(%r12)
+vmovdqu %ymm0, 64(%r12)
+vmovdqu %ymm0, 96(%r12)
+vmovdqu %ymm0, 128(%r12)
+vmovdqu %ymm0, 160(%r12)
+movq %rsi, (%r14)               /* write the 16-byte tag */
+movq %r8, 8(%r14)
+vzeroupper
+addq $104, %rsp
+popq %r14
+popq %r13
+popq %r12
+movq %rbp, %rsp
+popq %rbp
+ret
+FN_END poly1305_finish_ext_avx2
+
+/*
+ * poly1305_blocks_avx2(state %rdi, m %rsi, bytes %rdx)
+ *
+ * Vectorized Poly1305 core.  The message is absorbed in 64-byte
+ * strides (four 16-byte Poly1305 blocks in parallel lanes); the
+ * accumulator lives in ymm registers as five 26-bit limbs per lane.
+ * The flag word at state+176 steers start-up/shut-down (the selector
+ * bits are written by poly1305_finish_ext):
+ *   bit 0     set here once the first 64 bytes have been absorbed
+ *   bits 2-5  per-lane removal of the 2^128 message pad
+ *   bit 6     final pass: collapse the lanes and reduce mod 2^130-5
+ *   bits 7-12 select which precomputed powers of r feed the last strides
+ */
+GLOBAL_HIDDEN_FN poly1305_blocks_avx2
+poly1305_blocks_avx2_local:
+pushq %rbp
+movq %rsp, %rbp
+andq $-64, %rsp
+subq $384, %rsp
+/* ymm1 = 2^128 pad bit (1<<24 in the top 26-bit limb),
+   ymm10 = 26-bit limb mask (2^26-1), ymm11 = 5 (mod 2^130-5 fold) */
+movl $16777216, %eax
+movl $67108863, %ecx
+movl $5, %r8d
+vmovd %eax, %xmm1
+vmovd %ecx, %xmm10
+vmovd %r8d, %xmm0
+movq 176(%rdi), %rax
+vpbroadcastq %xmm1, %ymm1
+vpbroadcastq %xmm10, %ymm10
+vpbroadcastq %xmm0, %ymm11
+/* bits 2-5: drop the 2^128 pad from lanes that are past end-of-message */
+testq $60, %rax
+je poly1305_blocks_avx2_11
+poly1305_blocks_avx2_2:
+vpsrldq $8, %ymm1, %ymm15
+testq $4, %rax
+je poly1305_blocks_avx2_4
+poly1305_blocks_avx2_3:
+vpermq $192, %ymm15, %ymm15
+poly1305_blocks_avx2_4:
+testq $8, %rax
+je poly1305_blocks_avx2_6
+poly1305_blocks_avx2_5:
+vpermq $240, %ymm15, %ymm15
+poly1305_blocks_avx2_6:
+testq $16, %rax
+je poly1305_blocks_avx2_8
+poly1305_blocks_avx2_7:
+vpermq $252, %ymm15, %ymm15
+poly1305_blocks_avx2_8:
+testq $32, %rax
+je poly1305_blocks_avx2_10
+poly1305_blocks_avx2_9:
+vpxor %ymm15, %ymm15, %ymm15
+poly1305_blocks_avx2_10:
+vmovdqa %ymm15, %ymm1
+poly1305_blocks_avx2_11:
+/* bit0: test-and-set "started".  Carry set => accumulator already in
+   state; clear => initialize it from the first 64 message bytes */
+movq %rax, %rcx
+btsq $0, %rcx
+jc poly1305_blocks_avx2_13
+poly1305_blocks_avx2_12:
+/* first call: split 64 message bytes into 26-bit limbs
+   (h = ymm4,3,5,7,9) and add the 2^128 pad; record bit0 */
+vmovdqu (%rsi), %ymm3
+movq %rcx, %rax
+vmovdqu 32(%rsi), %ymm5
+vpunpcklqdq %ymm5, %ymm3, %ymm4
+addq $64, %rsi
+vpunpckhqdq %ymm5, %ymm3, %ymm7
+vpermq $216, %ymm4, %ymm6
+addq $-64, %rdx
+vpermq $216, %ymm7, %ymm0
+vpsrlq $52, %ymm6, %ymm8
+vpsllq $12, %ymm0, %ymm9
+vpsrlq $26, %ymm6, %ymm2
+vpsrlq $40, %ymm0, %ymm0
+vpand %ymm6, %ymm10, %ymm4
+vpor %ymm9, %ymm8, %ymm7
+vpand %ymm2, %ymm10, %ymm3
+vpor %ymm1, %ymm0, %ymm9
+vpsrlq $26, %ymm7, %ymm2
+vpand %ymm7, %ymm10, %ymm5
+vpand %ymm2, %ymm10, %ymm7
+movq %rax, 176(%rdi)
+jmp poly1305_blocks_avx2_14
+poly1305_blocks_avx2_13:
+/* subsequent call: reload the accumulator limbs saved as 32-bit
+   words in state[0..79] */
+vpermq $216, (%rdi), %ymm15
+vpxor %ymm0, %ymm0, %ymm0
+vpermq $216, 32(%rdi), %ymm14
+vpermq $216, 64(%rdi), %ymm13
+vpunpckldq %ymm0, %ymm15, %ymm4
+vpunpckhdq %ymm0, %ymm15, %ymm3
+vpunpckldq %ymm0, %ymm14, %ymm5
+vpunpckhdq %ymm0, %ymm14, %ymm7
+vpunpckldq %ymm0, %ymm13, %ymm9
+poly1305_blocks_avx2_14:
+cmpq $64, %rdx
+jb poly1305_blocks_avx2_34      /* nothing (more) to multiply in */
+poly1305_blocks_avx2_15:
+/* choose per-lane powers of r (precomputed by init_ext at
+   state+80/+100/+120/+140) from flag bits 7-12; the default path
+   (_29) uses the +140 power for all four lanes */
+vmovdqu 140(%rdi), %ymm0
+testq $8064, %rax
+je poly1305_blocks_avx2_29
+poly1305_blocks_avx2_16:
+vpermq $216, 80(%rdi), %ymm6
+vpermq $216, 100(%rdi), %ymm2
+vpermq $216, 120(%rdi), %ymm8
+vpermq $216, %ymm0, %ymm0
+testq $128, %rax
+je poly1305_blocks_avx2_18
+poly1305_blocks_avx2_17:
+vmovdqa %ymm0, %ymm15
+vmovdqa %ymm0, %ymm14
+vmovdqa %ymm0, %ymm13
+vmovdqa %ymm8, %ymm12
+jmp poly1305_blocks_avx2_28
+poly1305_blocks_avx2_18:
+testq $256, %rax
+je poly1305_blocks_avx2_20
+poly1305_blocks_avx2_19:
+vmovdqa %ymm0, %ymm15
+vmovdqa %ymm0, %ymm14
+vmovdqa %ymm8, %ymm13
+vmovdqa %ymm2, %ymm12
+jmp poly1305_blocks_avx2_28
+poly1305_blocks_avx2_20:
+testq $512, %rax
+je poly1305_blocks_avx2_22
+poly1305_blocks_avx2_21:
+vmovdqa %ymm0, %ymm15
+vmovdqa %ymm8, %ymm14
+vmovdqa %ymm2, %ymm13
+vmovdqa %ymm6, %ymm12
+jmp poly1305_blocks_avx2_28
+poly1305_blocks_avx2_22:
+testq $1024, %rax
+je poly1305_blocks_avx2_24
+poly1305_blocks_avx2_23:
+vpxor %ymm12, %ymm12, %ymm12
+movl $1, %r8d
+vmovdqa %ymm8, %ymm15
+vmovdqa %ymm2, %ymm14
+vmovdqa %ymm6, %ymm13
+vmovd %r8d, %xmm12
+jmp poly1305_blocks_avx2_28
+poly1305_blocks_avx2_24:
+testq $2048, %rax
+je poly1305_blocks_avx2_26
+poly1305_blocks_avx2_25:
+vpxor %ymm12, %ymm12, %ymm12
+movl $1, %r8d
+vmovd %r8d, %xmm13
+vmovdqa %ymm2, %ymm15
+vmovdqa %ymm6, %ymm14
+vmovdqa %ymm13, %ymm12
+jmp poly1305_blocks_avx2_28
+poly1305_blocks_avx2_26:
+testq $4096, %rax
+je poly1305_blocks_avx2_28
+poly1305_blocks_avx2_27:
+movl $1, %r8d
+vmovd %r8d, %xmm14
+vmovdqa %ymm6, %ymm15
+vmovdqa %ymm14, %ymm13
+vmovdqa %ymm14, %ymm12
+poly1305_blocks_avx2_28:
+/* transpose the selected per-lane powers into limb-sliced vectors
+   (spilled copies at 320/352(%rsp) are reused in the main loop) */
+vpunpcklqdq %ymm14, %ymm15, %ymm6
+vpunpcklqdq %ymm12, %ymm13, %ymm8
+vpunpckhqdq %ymm14, %ymm15, %ymm14
+vpunpckhqdq %ymm12, %ymm13, %ymm12
+vperm2i128 $32, %ymm8, %ymm6, %ymm2
+vperm2i128 $49, %ymm8, %ymm6, %ymm6
+vpsrlq $32, %ymm6, %ymm0
+vpsrlq $32, %ymm2, %ymm8
+vmovdqu %ymm0, 352(%rsp)
+vperm2i128 $32, %ymm12, %ymm14, %ymm13
+vmovdqu %ymm13, 320(%rsp)
+jmp poly1305_blocks_avx2_30
+poly1305_blocks_avx2_29:
+/* steady state: broadcast the +140 power's five limbs to all lanes */
+vpsrlq $32, %ymm0, %ymm12
+vpermq $0, %ymm0, %ymm2
+vpermq $85, %ymm0, %ymm6
+vpermq $85, %ymm12, %ymm13
+vpermq $170, %ymm0, %ymm0
+vpermq $0, %ymm12, %ymm8
+vmovdqu %ymm13, 352(%rsp)
+vmovdqu %ymm0, 320(%rsp)
+poly1305_blocks_avx2_30:
+/* first multiply pass: t = h * r-powers, add next 64 message bytes,
+   then carry-propagate the 26-bit limbs back into h = ymm4,3,5,7,9 */
+vmovdqu (%rsi), %ymm12
+movq %rdx, %r9
+vmovdqu 352(%rsp), %ymm15
+vmovdqu %ymm1, 160(%rsp)
+vmovdqu %ymm10, 192(%rsp)
+vmovdqu %ymm11, 128(%rsp)
+vperm2i128 $32, 32(%rsi), %ymm12, %ymm13
+xorl %r8d, %r8d
+vperm2i128 $49, 32(%rsi), %ymm12, %ymm12
+xorl %ecx, %ecx
+vpmuludq %ymm11, %ymm8, %ymm0
+vpmuludq %ymm11, %ymm6, %ymm1
+vmovdqu %ymm0, 224(%rsp)
+vmovdqu %ymm1, 256(%rsp)
+vpunpckldq %ymm12, %ymm13, %ymm14
+vpunpckhdq %ymm12, %ymm13, %ymm12
+vmovdqu %ymm14, 32(%rsp)
+vpmuludq %ymm0, %ymm9, %ymm0
+vpmuludq %ymm1, %ymm7, %ymm13
+vpaddq %ymm13, %ymm0, %ymm0
+vpmuludq %ymm11, %ymm15, %ymm10
+vpmuludq %ymm10, %ymm5, %ymm13
+vpaddq %ymm13, %ymm0, %ymm0
+vmovdqu %ymm10, 288(%rsp)
+vpmuludq 320(%rsp), %ymm11, %ymm11
+vpmuludq %ymm11, %ymm3, %ymm13
+vpaddq %ymm13, %ymm0, %ymm0
+vmovdqu %ymm11, (%rsp)
+vpmuludq %ymm2, %ymm4, %ymm13
+vpaddq %ymm13, %ymm0, %ymm0
+vpxor %ymm13, %ymm13, %ymm13
+vpunpckldq %ymm13, %ymm14, %ymm14
+vpaddq %ymm14, %ymm0, %ymm0
+vmovdqu %ymm0, 64(%rsp)
+vpmuludq %ymm11, %ymm9, %ymm14
+vpmuludq %ymm2, %ymm7, %ymm0
+vpaddq %ymm0, %ymm14, %ymm14
+vpmuludq %ymm8, %ymm5, %ymm0
+vpaddq %ymm0, %ymm14, %ymm14
+vpmuludq %ymm6, %ymm3, %ymm0
+vpaddq %ymm0, %ymm14, %ymm14
+vpmuludq %ymm15, %ymm4, %ymm0
+vpaddq %ymm0, %ymm14, %ymm0
+vpunpckhdq %ymm13, %ymm12, %ymm14
+vpsllq $18, %ymm14, %ymm14
+vpaddq %ymm14, %ymm0, %ymm14
+vpmuludq %ymm1, %ymm9, %ymm1
+vpmuludq %ymm10, %ymm7, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vpmuludq %ymm11, %ymm5, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vpmuludq %ymm2, %ymm3, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vpmuludq %ymm8, %ymm4, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vmovdqu 32(%rsp), %ymm0
+vpunpckhdq %ymm13, %ymm0, %ymm0
+vpsllq $6, %ymm0, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vmovdqu 64(%rsp), %ymm0
+vpsrlq $26, %ymm0, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vmovdqu %ymm1, 96(%rsp)
+vpmuludq %ymm2, %ymm9, %ymm1
+vpmuludq %ymm8, %ymm7, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vpmuludq %ymm10, %ymm9, %ymm10
+vpmuludq %ymm11, %ymm7, %ymm11
+vpaddq %ymm11, %ymm10, %ymm7
+vpmuludq %ymm6, %ymm5, %ymm0
+vpaddq %ymm0, %ymm1, %ymm1
+vpmuludq %ymm2, %ymm5, %ymm5
+vpaddq %ymm5, %ymm7, %ymm10
+vpmuludq %ymm15, %ymm3, %ymm15
+vpaddq %ymm15, %ymm1, %ymm1
+vpmuludq %ymm8, %ymm3, %ymm11
+vpaddq %ymm11, %ymm10, %ymm5
+vpunpckldq %ymm13, %ymm12, %ymm10
+vmovdqu 96(%rsp), %ymm12
+vpmuludq 320(%rsp), %ymm4, %ymm0
+vpaddq %ymm0, %ymm1, %ymm15
+vpsrlq $26, %ymm12, %ymm3
+vmovdqu 160(%rsp), %ymm1
+vpmuludq %ymm6, %ymm4, %ymm4
+vpaddq %ymm1, %ymm15, %ymm0
+vpsrlq $26, %ymm14, %ymm15
+vpaddq %ymm4, %ymm5, %ymm11
+vpsllq $12, %ymm10, %ymm4
+vmovdqu 192(%rsp), %ymm10
+vpaddq %ymm15, %ymm0, %ymm0
+vpaddq %ymm4, %ymm11, %ymm5
+vmovdqu 128(%rsp), %ymm11
+vpsrlq $26, %ymm0, %ymm9
+vpaddq %ymm3, %ymm5, %ymm7
+vpand 64(%rsp), %ymm10, %ymm13
+vpand %ymm10, %ymm12, %ymm12
+vpand %ymm10, %ymm7, %ymm5
+vpsrlq $26, %ymm7, %ymm7
+vpmuludq %ymm11, %ymm9, %ymm15
+vpand %ymm10, %ymm14, %ymm9
+vpaddq %ymm15, %ymm13, %ymm3
+vpand %ymm10, %ymm0, %ymm14
+vpaddq %ymm7, %ymm9, %ymm9
+vpand %ymm10, %ymm3, %ymm4
+vpsrlq $26, %ymm3, %ymm3
+vpsrlq $26, %ymm9, %ymm0
+vpand %ymm10, %ymm9, %ymm7
+vpaddq %ymm3, %ymm12, %ymm3
+vpaddq %ymm0, %ymm14, %ymm9
+/* remaining 64-byte strides = bytes/64 (signed-division-by-64 idiom);
+   loop only if at least two strides were present */
+sarq $5, %r9
+shrq $58, %r9
+addq %rdx, %r9
+sarq $6, %r9
+cmpq $2, %r9
+jl poly1305_blocks_avx2_34
+poly1305_blocks_avx2_31:
+/* loop setup: spill the power/mask vectors, rdx = trip count */
+vmovdqu %ymm6, 32(%rsp)
+lea -64(%rdx), %r9
+vmovdqu %ymm8, 64(%rsp)
+vmovdqu %ymm11, 128(%rsp)
+vmovdqu %ymm10, 192(%rsp)
+vmovdqu %ymm1, 160(%rsp)
+vmovdqu (%rsp), %ymm12
+sarq $5, %r9
+shrq $58, %r9
+lea -64(%rdx,%r9), %rdx
+sarq $6, %rdx
+poly1305_blocks_avx2_32:
+/* main loop: one 64-byte stride per iteration —
+   h = (h * r-powers) + next interleaved message block, with 26-bit
+   carry propagation; r8 counts strides, rcx is the input offset */
+vmovdqu 256(%rsp), %ymm15
+incq %r8
+vmovdqu 64(%rcx,%rsi), %ymm11
+vpmuludq 224(%rsp), %ymm9, %ymm8
+vpmuludq %ymm15, %ymm7, %ymm14
+vpaddq %ymm14, %ymm8, %ymm1
+vmovdqu 288(%rsp), %ymm8
+vperm2i128 $32, 96(%rcx,%rsi), %ymm11, %ymm10
+vperm2i128 $49, 96(%rcx,%rsi), %ymm11, %ymm6
+addq $64, %rcx
+vpmuludq %ymm8, %ymm5, %ymm13
+vpunpckldq %ymm6, %ymm10, %ymm0
+vpunpckhdq %ymm6, %ymm10, %ymm11
+vpaddq %ymm13, %ymm1, %ymm10
+vpmuludq %ymm12, %ymm3, %ymm6
+vpaddq %ymm6, %ymm10, %ymm14
+vpxor %ymm10, %ymm10, %ymm10
+vpunpckldq %ymm10, %ymm0, %ymm6
+vpunpckhdq %ymm10, %ymm0, %ymm0
+vpmuludq %ymm2, %ymm4, %ymm1
+vpaddq %ymm1, %ymm14, %ymm13
+vpaddq %ymm6, %ymm13, %ymm1
+vmovdqu 64(%rsp), %ymm6
+vmovdqu %ymm1, (%rsp)
+vpsrlq $26, %ymm1, %ymm1
+vpmuludq %ymm12, %ymm9, %ymm14
+vpmuludq %ymm2, %ymm7, %ymm13
+vpaddq %ymm13, %ymm14, %ymm14
+vpmuludq %ymm6, %ymm5, %ymm13
+vpaddq %ymm13, %ymm14, %ymm14
+vpmuludq 32(%rsp), %ymm3, %ymm13
+vpaddq %ymm13, %ymm14, %ymm14
+vpmuludq 352(%rsp), %ymm4, %ymm13
+vpaddq %ymm13, %ymm14, %ymm13
+vpunpckhdq %ymm10, %ymm11, %ymm14
+vpsllq $18, %ymm14, %ymm14
+vpaddq %ymm14, %ymm13, %ymm13
+vpmuludq %ymm15, %ymm9, %ymm15
+vpmuludq %ymm8, %ymm7, %ymm14
+vpaddq %ymm14, %ymm15, %ymm15
+vpmuludq %ymm12, %ymm5, %ymm14
+vpaddq %ymm14, %ymm15, %ymm15
+vpmuludq %ymm2, %ymm3, %ymm14
+vpaddq %ymm14, %ymm15, %ymm15
+vpmuludq %ymm6, %ymm4, %ymm14
+vpaddq %ymm14, %ymm15, %ymm14
+vpsllq $6, %ymm0, %ymm15
+vpaddq %ymm15, %ymm14, %ymm14
+vmovdqu 32(%rsp), %ymm15
+vpaddq %ymm1, %ymm14, %ymm1
+vpmuludq %ymm2, %ymm9, %ymm0
+vpmuludq %ymm6, %ymm7, %ymm14
+vpmuludq %ymm8, %ymm9, %ymm9
+vpmuludq %ymm12, %ymm7, %ymm7
+vpaddq %ymm7, %ymm9, %ymm7
+vpaddq %ymm14, %ymm0, %ymm0
+vpsrlq $26, %ymm1, %ymm9
+vpmuludq %ymm15, %ymm5, %ymm14
+vpmuludq %ymm2, %ymm5, %ymm5
+vpaddq %ymm5, %ymm7, %ymm5
+vpaddq %ymm14, %ymm0, %ymm0
+vpmuludq 352(%rsp), %ymm3, %ymm14
+vpmuludq %ymm6, %ymm3, %ymm3
+vpaddq %ymm3, %ymm5, %ymm5
+vpaddq %ymm14, %ymm0, %ymm0
+vpmuludq 320(%rsp), %ymm4, %ymm14
+vpmuludq %ymm15, %ymm4, %ymm4
+vpaddq %ymm4, %ymm5, %ymm5
+vpaddq %ymm14, %ymm0, %ymm0
+vpunpckldq %ymm10, %ymm11, %ymm4
+vpaddq 160(%rsp), %ymm0, %ymm14
+vpsrlq $26, %ymm13, %ymm0
+vpsllq $12, %ymm4, %ymm3
+vpaddq %ymm0, %ymm14, %ymm14
+vpaddq %ymm3, %ymm5, %ymm7
+vpsrlq $26, %ymm14, %ymm0
+vpaddq %ymm9, %ymm7, %ymm10
+vmovdqu 192(%rsp), %ymm9
+vpsrlq $26, %ymm10, %ymm11
+vpand (%rsp), %ymm9, %ymm6
+vpand %ymm9, %ymm13, %ymm13
+vpand %ymm9, %ymm1, %ymm1
+vpand %ymm9, %ymm14, %ymm14
+vpand %ymm9, %ymm10, %ymm5
+vpmuludq 128(%rsp), %ymm0, %ymm8
+vpaddq %ymm8, %ymm6, %ymm15
+vpaddq %ymm11, %ymm13, %ymm0
+vpsrlq $26, %ymm15, %ymm3
+vpand %ymm9, %ymm0, %ymm7
+vpsrlq $26, %ymm0, %ymm0
+vpand %ymm9, %ymm15, %ymm4
+vpaddq %ymm3, %ymm1, %ymm3
+vpaddq %ymm0, %ymm14, %ymm9
+cmpq %rdx, %r8
+jb poly1305_blocks_avx2_32
+poly1305_blocks_avx2_34:
+testq $64, %rax
+jne poly1305_blocks_avx2_36
+poly1305_blocks_avx2_35:
+/* not the final pass (bit6 clear): repack the five limb vectors to
+   32-bit words and save the accumulator back into state[0..79] */
+vpshufd $8, %ymm4, %ymm0
+vpshufd $8, %ymm3, %ymm3
+vpshufd $8, %ymm5, %ymm5
+vpshufd $8, %ymm7, %ymm7
+vpshufd $8, %ymm9, %ymm9
+vpermq $8, %ymm0, %ymm1
+vpermq $8, %ymm3, %ymm2
+vpermq $8, %ymm5, %ymm4
+vpermq $8, %ymm7, %ymm6
+vpermq $8, %ymm9, %ymm11
+vperm2i128 $32, %ymm2, %ymm1, %ymm8
+vperm2i128 $32, %ymm6, %ymm4, %ymm10
+vmovdqu %ymm8, (%rdi)
+vmovdqu %ymm10, 32(%rdi)
+vmovdqu %xmm11, 64(%rdi)
+jmp poly1305_blocks_avx2_37
+poly1305_blocks_avx2_36:
+/* final pass (bit6 set): horizontally add the four lanes of each
+   26-bit limb ... */
+vpermq $245, %ymm4, %ymm0
+vpaddq %ymm0, %ymm4, %ymm4
+vpermq $245, %ymm3, %ymm1
+vpaddq %ymm1, %ymm3, %ymm10
+vpermq $245, %ymm5, %ymm3
+vpermq $170, %ymm4, %ymm6
+vpaddq %ymm3, %ymm5, %ymm13
+vpaddq %ymm6, %ymm4, %ymm8
+vpermq $170, %ymm10, %ymm11
+vpermq $245, %ymm7, %ymm5
+vpaddq %ymm11, %ymm10, %ymm12
+vpaddq %ymm5, %ymm7, %ymm7
+vpermq $170, %ymm13, %ymm14
+vpermq $245, %ymm9, %ymm2
+vpaddq %ymm14, %ymm13, %ymm15
+vpaddq %ymm2, %ymm9, %ymm9
+vpermq $170, %ymm7, %ymm0
+vpaddq %ymm0, %ymm7, %ymm1
+vpermq $170, %ymm9, %ymm2
+vpaddq %ymm2, %ymm9, %ymm3
+/* ... then carry-propagate scalar 26-bit limbs and repack them into
+   three 44/44/42-bit limbs in r9/r11/rax */
+vmovd %xmm8, %r9d
+movl %r9d, %r8d
+shrl $26, %r8d
+andq $67108863, %r9
+vmovd %xmm12, %esi
+addl %r8d, %esi
+movl %esi, %r11d
+shrl $26, %esi
+andq $67108863, %r11
+vmovd %xmm15, %ecx
+addl %esi, %ecx
+movl %ecx, %eax
+shrl $26, %eax
+andq $67108863, %rcx
+shlq $8, %rcx
+vmovd %xmm1, %r8d
+addl %eax, %r8d
+movl %r8d, %r10d
+shrl $26, %r8d
+andq $67108863, %r10
+movq %r10, %rax
+shrq $10, %rax
+shlq $34, %r10
+vmovd %xmm3, %edx
+addl %r8d, %edx
+shlq $16, %rdx
+orq %rdx, %rax
+/* fold the overflow above 2^130 back in (x5, mod 2^130-5) */
+movq %rax, %r8
+shrq $42, %r8
+lea (%r8,%r8,4), %rdx
+movq %r11, %r8
+shlq $26, %r8
+orq %r8, %r9
+movq $0xfffffffffff, %r8
+shrq $18, %r11
+andq %r8, %r9
+addq %r9, %rdx
+orq %rcx, %r11
+movq %rdx, %rsi
+orq %r10, %r11
+shrq $44, %rsi
+andq %r8, %r11
+addq %r11, %rsi
+movq $0x3ffffffffff, %r9
+movq %rsi, %r10
+andq %r9, %rax
+shrq $44, %r10
+andq %r8, %rdx
+addq %r10, %rax
+movq %r8, %rcx
+andq %rax, %r9
+andq %r8, %rsi
+shrq $42, %rax
+movq $0xfffffc0000000000, %r10
+lea (%rax,%rax,4), %r11
+addq %r11, %rdx
+andq %rdx, %rcx
+shrq $44, %rdx
+addq %rdx, %rsi
+/* final reduction: compute h + 5 - 2^130 and select it over h when
+   it does not go negative (sign-bit -> andn/and mask select) */
+lea 5(%rcx), %rdx
+movq %rdx, %r11
+andq %r8, %rdx
+shrq $44, %r11
+addq %rsi, %r11
+movq %r11, %rax
+andq %r11, %r8
+shrq $44, %rax
+addq %r9, %rax
+addq %r10, %rax
+movq %rax, %r10
+shrq $63, %r10
+decq %r10
+andn %rcx, %r10, %rcx
+andq %r10, %rdx
+orq %rdx, %rcx
+andq %r10, %r8
+andn %rsi, %r10, %rdx
+andq %r10, %rax
+andn %r9, %r10, %rsi
+orq %r8, %rdx
+orq %rax, %rsi
+/* store h0,h1,h2 (44/44/42-bit limbs) for finish_ext to combine */
+movq %rcx, (%rdi)
+movq %rdx, 8(%rdi)
+movq %rsi, 16(%rdi)
+poly1305_blocks_avx2_37:
+vzeroupper
+movq %rbp, %rsp
+popq %rbp
+ret
+FN_END poly1305_blocks_avx2
+
<p>+/*
+ * poly1305_init_ext_avx2(state %rdi, key %rsi, bytes %rdx)
+ * Zeroes the state, clamps r = key[0..16] into three 44-bit limbs,
+ * stores the pad s = key[16..32] at state+160, writes r as five
+ * 26-bit limbs at state+80..96, and — depending on how many bytes
+ * will be hashed (bytes == 0 is treated as unbounded) — precomputes
+ * higher powers of r for the 4-way loop: r^2 at +100..116 (>16
+ * bytes), plus r*r^2 at +120..136 and (r^2)^2 at +140..156 (>48
+ * bytes; only the +120 slot for 33..48 bytes).  Clears the flag word
+ * at state+176.
+ */
+GLOBAL_HIDDEN_FN poly1305_init_ext_avx2
+poly1305_init_ext_avx2_local:
+pushq %r12
+pushq %r13
+pushq %r14
+pushq %r15
+pushq %rbx
+movq %rdi, %r10
+vpxor %ymm0, %ymm0, %ymm0
+movq %rdx, %r12
+vpxor %xmm1, %xmm1, %xmm1
+vmovdqu %xmm1, 64(%r10)
+vmovdqu %ymm0, (%r10)
+vmovdqu %ymm0, 32(%r10)
+/* bytes == 0 -> r12 = -1 ("unbounded"), so all powers get computed */
+movq $-1, %r8
+testq %r12, %r12
+movq 8(%rsi), %rdi
+movq $0xffc0fffffff, %r9
+movq %rdi, %rcx
+cmove %r8, %r12
+/* clamp r into 44/44/40-bit limbs r9:r8:rdi (the masks are the
+   standard Poly1305 clamp split across the limb boundaries) */
+movq (%rsi), %r8
+andq %r8, %r9
+shrq $44, %r8
+movq $0xfffffc0ffff, %r11
+shlq $20, %rcx
+shrq $24, %rdi
+orq %rcx, %r8
+movq $0xffffffc0f, %rcx
+andq %r11, %r8
+andq %rcx, %rdi
+/* pad s = key[16..32] kept at state+160 for the final addition */
+movq 16(%rsi), %rcx
+movq %rcx, 160(%r10)
+movq %r9, %rcx
+movq 24(%rsi), %rdx
+movq %rdx, 168(%r10)
+/* split r (44-bit limbs) into five 26-bit limbs at state+80..96 */
+movl %r9d, %edx
+andl $67108863, %edx
+movl %edx, 80(%r10)
+movq %r8, %rdx
+shrq $26, %rcx
+shlq $18, %rdx
+orq %rdx, %rcx
+movq %r8, %rdx
+shrq $8, %rdx
+andl $67108863, %ecx
+andl $67108863, %edx
+movl %ecx, 84(%r10)
+movq %r8, %rcx
+movl %edx, 88(%r10)
+movq %rdi, %rdx
+shrq $34, %rcx
+shlq $10, %rdx
+orq %rdx, %rcx
+movq %rdi, %rdx
+shrq $16, %rdx
+andl $67108863, %ecx
+movl %ecx, 92(%r10)
+movl %edx, 96(%r10)
+cmpq $16, %r12
+jbe poly1305_init_ext_avx2_7    /* <=16 bytes: r alone suffices */
+poly1305_init_ext_avx2_2:
+/* r^2 = r*r: schoolbook on 44-bit limbs with the x20 (4*5) wrap
+   factor for mod 2^130-5; result limbs end in rsi:rcx:rbx and its
+   26-bit form is stored at state+100..116 */
+movq %r9, %rax
+lea (%rdi,%rdi,4), %r14
+mulq %r9
+shlq $2, %r14
+movq %rax, %r11
+movq %rdx, %r15
+lea (%r8,%r8), %rax
+mulq %r14
+addq %rax, %r11
+lea (%r9,%r9), %rax
+movq %r11, %rsi
+adcq %rdx, %r15
+mulq %r8
+movq %rax, %rbx
+movq %r14, %rax
+movq %rdx, %rcx
+lea (%rdi,%rdi), %r14
+mulq %rdi
+addq %rax, %rbx
+movq %r8, %rax
+adcq %rdx, %rcx
+mulq %r8
+shlq $20, %r15
+movq %rax, %r13
+shrq $44, %rsi
+movq %r9, %rax
+orq %rsi, %r15
+movq %rdx, %rsi
+mulq %r14
+addq %r15, %rbx
+movq %rbx, %r15
+adcq $0, %rcx
+addq %rax, %r13
+adcq %rdx, %rsi
+shlq $20, %rcx
+shrq $44, %r15
+orq %r15, %rcx
+addq %rcx, %r13
+movq $0xfffffffffff, %rcx
+movq %r13, %rdx
+adcq $0, %rsi
+andq %rcx, %r11
+shlq $22, %rsi
+andq %rcx, %rbx
+shrq $42, %rdx
+orq %rdx, %rsi
+lea (%rsi,%rsi,4), %rsi
+addq %rsi, %r11
+movq %rcx, %rsi
+andq %r11, %rsi
+shrq $44, %r11
+addq %r11, %rbx
+movq $0x3ffffffffff, %r11
+andq %rbx, %rcx
+andq %r11, %r13
+shrq $44, %rbx
+movq %rsi, %r11
+movq %rcx, %rdx
+addq %r13, %rbx
+shrq $26, %r11
+movq %rbx, %r15
+shlq $18, %rdx
+movq %rcx, %r14
+orq %rdx, %r11
+movq %rcx, %rdx
+shrq $34, %rdx
+movl %esi, %r13d
+shlq $10, %r15
+andl $67108863, %r13d
+orq %r15, %rdx
+andl $67108863, %r11d
+shrq $8, %r14
+andl $67108863, %edx
+movl %edx, 112(%r10)
+movq %rbx, %rdx
+shrq $16, %rdx
+andl $67108863, %r14d
+movl %r13d, 100(%r10)
+movl %r11d, 104(%r10)
+movl %r14d, 108(%r10)
+movl %edx, 116(%r10)
+cmpq $48, %r12
+jbe poly1305_init_ext_avx2_4
+poly1305_init_ext_avx2_3:
+/* more than 48 bytes: square r^2 (rsi:rcx:rbx) as well, storing its
+   26-bit limbs at state+140..156; the x20 wrap factor for the r*r^2
+   product is parked at -16(%rsp) for the shared tail below */
+movq %rsi, %rax
+lea (%rbx,%rbx,4), %r15
+mulq %rsi
+shlq $2, %r15
+movq %rax, %r13
+movq %rdx, %r12
+lea (%rcx,%rcx), %rax
+mulq %r15
+addq %rax, %r13
+lea (%rsi,%rsi), %rax
+movq %r15, -16(%rsp)
+adcq %rdx, %r12
+mulq %rcx
+movq %rax, %r14
+movq %rbx, %rax
+movq %rdx, %r11
+mulq %r15
+addq %rax, %r14
+movq %rcx, %rax
+movq %r13, %r15
+adcq %rdx, %r11
+mulq %rcx
+shlq $20, %r12
+shrq $44, %r15
+orq %r15, %r12
+movq %rax, %r15
+addq %r12, %r14
+movq %rdx, %r12
+movq %rsi, %rax
+lea (%rbx,%rbx), %rdx
+adcq $0, %r11
+mulq %rdx
+addq %rax, %r15
+adcq %rdx, %r12
+movq %r14, %rdx
+shlq $20, %r11
+shrq $44, %rdx
+orq %rdx, %r11
+addq %r11, %r15
+movq $0xfffffffffff, %r11
+movq %r15, %rdx
+adcq $0, %r12
+andq %r11, %r13
+shlq $22, %r12
+andq %r11, %r14
+shrq $42, %rdx
+orq %rdx, %r12
+lea (%r12,%r12,4), %r12
+addq %r12, %r13
+movq %r11, %r12
+andq %r13, %r12
+shrq $44, %r13
+addq %r13, %r14
+movq $0x3ffffffffff, %r13
+andq %r14, %r11
+andq %r13, %r15
+shrq $44, %r14
+movq %r11, %rdx
+shlq $18, %rdx
+addq %r14, %r15
+movl %r12d, %r14d
+movq %r11, %r13
+shrq $26, %r12
+andl $67108863, %r14d
+orq %rdx, %r12
+movq %r15, %rdx
+shrq $34, %r11
+shlq $10, %rdx
+andl $67108863, %r12d
+orq %rdx, %r11
+shrq $8, %r13
+andl $67108863, %r11d
+movl %r11d, 152(%r10)
+andl $67108863, %r13d
+shrq $16, %r15
+movl %r14d, 140(%r10)
+movl %r12d, 144(%r10)
+movl %r13d, 148(%r10)
+movl %r15d, 156(%r10)
+movq -16(%rsp), %r11
+jmp poly1305_init_ext_avx2_6
+poly1305_init_ext_avx2_4:
+cmpq $32, %r12
+jbe poly1305_init_ext_avx2_7    /* 17..32 bytes: r and r^2 suffice */
+poly1305_init_ext_avx2_5:
+/* 33..48 bytes: r11 = 20 * (top limb of r^2), the wrap factor needed
+   by the r*r^2 product below */
+lea (%rbx,%rbx,4), %r11
+shlq $2, %r11
+poly1305_init_ext_avx2_6:
+/* multiply r (r9:r8:rdi) by r^2 (rsi:rcx:rbx); the product's 26-bit
+   limbs go to state+120..136 */
+movq %r9, %rax
+lea (%rcx,%rcx,4), %r13
+mulq %rsi
+shlq $2, %r13
+movq %rax, %r14
+movq %rdi, %rax
+movq %rdx, %r12
+mulq %r13
+addq %rax, %r14
+movq %r8, %rax
+adcq %rdx, %r12
+mulq %r11
+addq %rax, %r14
+movq %r8, %rax
+adcq %rdx, %r12
+mulq %rsi
+movq %rax, %r15
+movq %r9, %rax
+movq %rdx, %r13
+mulq %rcx
+addq %rax, %r15
+movq %r11, %rax
+movq %r14, %r11
+adcq %rdx, %r13
+mulq %rdi
+addq %rax, %r15
+movq %rdi, %rax
+adcq %rdx, %r13
+mulq %rsi
+shlq $20, %r12
+movq %rax, %rsi
+shrq $44, %r11
+movq %r8, %rax
+orq %r11, %r12
+movq %rdx, %rdi
+mulq %rcx
+addq %r12, %r15
+movq %r15, %rcx
+adcq $0, %r13
+addq %rax, %rsi
+movq %r9, %rax
+movq $0xfffffffffff, %r9
+adcq %rdx, %rdi
+andq %r9, %r14
+mulq %rbx
+addq %rax, %rsi
+adcq %rdx, %rdi
+movq %r9, %rdx
+shlq $20, %r13
+andq %r9, %r15
+shrq $44, %rcx
+orq %rcx, %r13
+addq %r13, %rsi
+movq %rsi, %rbx
+adcq $0, %rdi
+shlq $22, %rdi
+shrq $42, %rbx
+orq %rbx, %rdi
+lea (%rdi,%rdi,4), %r8
+addq %r8, %r14
+andq %r14, %rdx
+shrq $44, %r14
+addq %r14, %r15
+movq $0x3ffffffffff, %r14
+andq %r15, %r9
+andq %r14, %rsi
+shrq $44, %r15
+movq %r9, %rax
+addq %r15, %rsi
+movl %edx, %r15d
+movq %rsi, %rbx
+movq %r9, %rcx
+shrq $26, %rdx
+andl $67108863, %r15d
+shlq $18, %rax
+shrq $34, %r9
+orq %rax, %rdx
+shlq $10, %rbx
+shrq $8, %rcx
+orq %rbx, %r9
+shrq $16, %rsi
+andl $67108863, %edx
+andl $67108863, %ecx
+andl $67108863, %r9d
+movl %r15d, 120(%r10)
+movl %edx, 124(%r10)
+movl %ecx, 128(%r10)
+movl %r9d, 132(%r10)
+movl %esi, 136(%r10)
+poly1305_init_ext_avx2_7:
+movq $0, 176(%r10)              /* fresh flag word: nothing started yet */
+vzeroupper
+popq %rbx
+popq %r15
+popq %r14
+popq %r13
+popq %r12
+ret
+FN_END poly1305_init_ext_avx2
</p>
+