#include "../chacha20/macro.S"
#include "constants.S"
SECTION_TEXT

/*
 * poly1305_block_size_avx2
 * Returns the constant 64 in %eax.
 */
GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx2, 0, 0
    movl $64, %eax
    ret
FN_END poly1305_block_size_avx2

/*
 * poly1305_auth_avx2
 * One-shot driver: init_ext, then blocks on the 64-byte-multiple prefix,
 * then finish_ext on whatever is left.
 *
 * Registers as consumed below (AT&T operand order, src then dst):
 *   %rdi  forwarded as 4th argument of finish_ext, which stores 16 bytes there
 *   %rsi  input pointer
 *   %rdx  input length in bytes (also forwarded to init_ext as its 3rd argument)
 *   %rcx  forwarded as 2nd argument of init_ext (the 32 bytes it reads)
 *
 * The working state is a 224-byte area at (%rsp); the frame is laid out so
 * that %rsp is a multiple of 64 after the prologue (and $-64, then 4 pushes
 * of 8 bytes plus 224 = 256 bytes).
 */
GLOBAL_HIDDEN_FN poly1305_auth_avx2
/*
 * Disabled in the original source:
 *   cmp $128, %rdx
 *   jb poly1305_auth_x86_local
 */
    pushq %rbp
    movq %rsp, %rbp
    andq $-64, %rsp
    pushq %r12
    pushq %r14
    pushq %r15
    pushq %rbx
    subq $224, %rsp
    /* r14 = input pointer, rbx = output pointer, r12 = remaining length */
    movq %rsi, %r14
    movq %rdi, %rbx
    lea (%rsp), %rdi
    movq %rcx, %rsi
    movq %rdx, %r12
    call poly1305_init_ext_avx2_local
poly1305_auth_avx2_2:
    /* r15 = length rounded down to a multiple of 64; skip blocks if zero */
    movq %r12, %r15
    andq $-64, %r15
    je poly1305_auth_avx2_5
poly1305_auth_avx2_3:
    movq %r14, %rsi
    lea (%rsp), %rdi
    movq %r15, %rdx
    call poly1305_blocks_avx2_local
poly1305_auth_avx2_4:
    /* advance past the bytes just processed */
    addq %r15, %r14
    subq %r15, %r12
poly1305_auth_avx2_5:
    movq %r14, %rsi
    lea (%rsp), %rdi
    movq %r12, %rdx
    movq %rbx, %rcx
    call poly1305_finish_ext_avx2_local
poly1305_auth_avx2_6:
    addq $224, %rsp
    popq %rbx
    popq %r15
    popq %r14
    popq %r12
    movq %rbp, %rsp
    popq %rbp
    ret
FN_END poly1305_auth_avx2

/*
 * poly1305_finish_ext_avx2
 *   %rdi  state pointer (kept in r12)
 *   %rsi  pointer to leftover input
 *   %rdx  leftover length in bytes (kept in r13)
 *   %rcx  output pointer, receives 16 bytes (kept in r14)
 *
 * Steps, as implemented below:
 *   1. If leftover != 0: copy it into a zeroed 64-byte stack buffer, write a
 *      single 1 byte right after it when the length is not a multiple of 16,
 *      update the flag word at 176(state) and run blocks on the 64-byte buffer.
 *   2. If bit 0 of the flag word is set: update the flag word again (bits
 *      512/1024/2048/4096 depending on leftover, plus 32|64) and run blocks
 *      on an all-zero 64-byte buffer.
 *   3. Merge the three words at 0/8/16(state) into a 128-bit value, add the
 *      128-bit value stored at 160(state), zero 192 bytes of state, store the
 *      result.
 *
 * NOTE(review): the copy only looks at bits 0..5 of the length, so the caller
 * presumably passes leftover < 64 - confirm against callers.
 */
GLOBAL_HIDDEN_FN poly1305_finish_ext_avx2
poly1305_finish_ext_avx2_local:
    pushq %rbp
    movq %rsp, %rbp
    andq $-64, %rsp
    pushq %r12
    pushq %r13
    pushq %r14
    subq $104, %rsp
    movq %rdx, %r13
    movq %rcx, %r14
    movq %rdi, %r12
    testq %r13, %r13
    je poly1305_finish_ext_avx2_29
poly1305_finish_ext_avx2_2:
    /*
     * rax walks the stack buffer; rsi becomes (source - buffer) so that
     * (%rax,%rsi) addresses the source byte matching buffer position rax.
     */
    lea (%rsp), %rax
    vpxor %ymm0, %ymm0, %ymm0
    subq %rax, %rsi
    vmovdqu %ymm0, (%rsp)
    vmovdqu %ymm0, 32(%rsp)
    /* copy 32/16/8/4/2/1 byte pieces according to the bits of the length */
    testq $32, %r13
    je poly1305_finish_ext_avx2_4
poly1305_finish_ext_avx2_3:
    vmovdqu (%rsp,%rsi), %ymm0
    lea 32(%rsp), %rax
    vmovdqu %ymm0, (%rsp)
poly1305_finish_ext_avx2_4:
    testq $16, %r13
    je poly1305_finish_ext_avx2_6
poly1305_finish_ext_avx2_5:
    vmovdqu (%rax,%rsi), %xmm0
    vmovdqu %xmm0, (%rax)
    addq $16, %rax
poly1305_finish_ext_avx2_6:
    testq $8, %r13
    je poly1305_finish_ext_avx2_8
poly1305_finish_ext_avx2_7:
    movq (%rax,%rsi), %rdx
    movq %rdx, (%rax)
    addq $8, %rax
poly1305_finish_ext_avx2_8:
    testq $4, %r13
    je poly1305_finish_ext_avx2_10
poly1305_finish_ext_avx2_9:
    movl (%rax,%rsi), %edx
    movl %edx, (%rax)
    addq $4, %rax
poly1305_finish_ext_avx2_10:
    testq $2, %r13
    je poly1305_finish_ext_avx2_12
poly1305_finish_ext_avx2_11:
    movzwl (%rax,%rsi), %edx
    movw %dx, (%rax)
    addq $2, %rax
poly1305_finish_ext_avx2_12:
    testq $1, %r13
    je poly1305_finish_ext_avx2_14
poly1305_finish_ext_avx2_13:
    movb (%rax,%rsi), %dl
    movb %dl, (%rax)
poly1305_finish_ext_avx2_14:
    /* length not a multiple of 16: append a single 1 byte after the data */
    testq $15, %r13
    je poly1305_finish_ext_avx2_16
poly1305_finish_ext_avx2_15:
    movb $1, (%rsp,%r13)
poly1305_finish_ext_avx2_16:
    /*
     * Clear flag bits 2-5 and 7-12 (mask is ~8124), then set exactly one of
     * 4 / 8 / 16 / 32 for leftover >= 48 / >= 32 / >= 16 / < 16.
     * blocks uses those four bits to decide which lanes get the 1<<24 term.
     */
    movq 176(%r12), %rdx
    andq $-8125, %rdx
    cmpq $48, %r13
    jb poly1305_finish_ext_avx2_18
poly1305_finish_ext_avx2_17:
    orq $4, %rdx
    jmp poly1305_finish_ext_avx2_21
poly1305_finish_ext_avx2_18:
    cmpq $32, %r13
    jb poly1305_finish_ext_avx2_20
poly1305_finish_ext_avx2_19:
    orq $8, %rdx
    jmp poly1305_finish_ext_avx2_21
poly1305_finish_ext_avx2_20:
    movq %rdx, %rax
    orq $32, %rdx
    orq $16, %rax
    cmpq $16, %r13
    cmovae %rax, %rdx
poly1305_finish_ext_avx2_21:
    /*
     * Only when bit 0 is already set: leftover <= 16 adds bit 256,
     * 16 < leftover <= 32 adds bit 128, larger leftovers add neither.
     */
    testq $1, %rdx
    je poly1305_finish_ext_avx2_27
poly1305_finish_ext_avx2_22:
    cmpq $16, %r13
    ja poly1305_finish_ext_avx2_24
poly1305_finish_ext_avx2_23:
    orq $256, %rdx
    movq %rdx, 176(%r12)
    jmp poly1305_finish_ext_avx2_28
poly1305_finish_ext_avx2_24:
    cmpq $32, %r13
    ja poly1305_finish_ext_avx2_27
poly1305_finish_ext_avx2_25:
    orq $128, %rdx
    movq %rdx, 176(%r12)
    jmp poly1305_finish_ext_avx2_28
poly1305_finish_ext_avx2_27:
    movq %rdx, 176(%r12)
poly1305_finish_ext_avx2_28:
    /* run the padded 64-byte stack buffer through blocks */
    movq %r12, %rdi
    lea (%rsp), %rsi
    movl $64, %edx
    vzeroupper
    call poly1305_blocks_avx2_local
poly1305_finish_ext_avx2_29:
    /* nothing more to fold in unless bit 0 of the flag word is set */
    movq 176(%r12), %rdx
    testq $1, %rdx
    je poly1305_finish_ext_avx2_37
poly1305_finish_ext_avx2_30:
    /*
     * Pick one of 512 / 1024 / 2048 / 4096:
     *   leftover == 0 or > 48 -> 512, > 32 -> 1024, > 16 -> 2048, else 4096.
     */
    andq $-8125, %rdx
    testq %r13, %r13
    je poly1305_finish_ext_avx2_32
poly1305_finish_ext_avx2_31:
    cmpq $48, %r13
    jbe poly1305_finish_ext_avx2_33
poly1305_finish_ext_avx2_32:
    orq $512, %rdx
    jmp poly1305_finish_ext_avx2_36
poly1305_finish_ext_avx2_33:
    cmpq $32, %r13
    jbe poly1305_finish_ext_avx2_35
poly1305_finish_ext_avx2_34:
    orq $1024, %rdx
    jmp poly1305_finish_ext_avx2_36
poly1305_finish_ext_avx2_35:
    movq %rdx, %rax
    orq $4096, %rdx
    orq $2048, %rax
    cmpq $16, %r13
    cmova %rax, %rdx
poly1305_finish_ext_avx2_36:
    /*
     * 96 = 32|64: bit 32 zeroes the 1<<24 vector in blocks, bit 64 makes
     * blocks take its reduce-and-store-three-words exit path.
     * The block fed in is 64 zero bytes.
     */
    orq $96, %rdx
    movq %r12, %rdi
    vpxor %ymm0, %ymm0, %ymm0
    lea (%rsp), %rsi
    movq %rdx, 176(%r12)
    movl $64, %edx
    vmovdqu %ymm0, (%rsp)
    vmovdqu %ymm0, 32(%rsp)
    vzeroupper
    call poly1305_blocks_avx2_local
poly1305_finish_ext_avx2_37:
    /*
     * rsi = word0 | (word1 << 44); r8 = (word1 >> 20) | (word2 << 24),
     * i.e. the words at 0/8/16(state) are packed at bit offsets 0/44/88.
     * Then add the 128-bit value held at 160/168(state).
     */
    movq 8(%r12), %r8
    movq %r8, %rsi
    movq 16(%r12), %rax
    vpxor %ymm0, %ymm0, %ymm0
    shlq $44, %rsi
    shrq $20, %r8
    shlq $24, %rax
    orq (%r12), %rsi
    orq %rax, %r8
    movq 160(%r12), %rdx
    movq 168(%r12), %rcx
    addq %rdx, %rsi
    adcq %rcx, %r8
    /* wipe state bytes 0..191 before returning */
    vmovdqu %ymm0, (%r12)
    vmovdqu %ymm0, 32(%r12)
    vmovdqu %ymm0, 64(%r12)
    vmovdqu %ymm0, 96(%r12)
    vmovdqu %ymm0, 128(%r12)
    vmovdqu %ymm0, 160(%r12)
    /* store the 16-byte result */
    movq %rsi, (%r14)
    movq %r8, 8(%r14)
    vzeroupper
    addq $104, %rsp
    popq %r14
    popq %r13
    popq %r12
    movq %rbp, %rsp
    popq %rbp
    ret
FN_END poly1305_finish_ext_avx2

/*
 * poly1305_blocks_avx2
 *   %rdi  state pointer
 *   %rsi  input pointer
 *   %rdx  byte count
 *
 * Works on 64 input bytes (four 16-byte blocks) at a time. The accumulator is
 * held as five vectors of four 64-bit lanes, each lane a limb masked with
 * 67108863 (2^26 - 1). Vector constants set up in the prologue:
 *   ymm1  = 16777216 (1 << 24) in every lane, OR-ed / added into the top limb
 *   ymm10 = 67108863 limb mask
 *   ymm11 = 5
 * Flag word at 176(state), as tested below:
 *   bit 0        set by this routine on its first call for a state
 *   bits 4..32   adjust which lanes keep the 1 << 24 term
 *   bit 64       selects the final reduce-to-three-words exit
 *   bits 128..4096 select which stored multiplier goes in which lane
 * The frame is 384 bytes, aligned to 64, and used for spills.
 */
GLOBAL_HIDDEN_FN poly1305_blocks_avx2
poly1305_blocks_avx2_local:
    pushq %rbp
    movq %rsp, %rbp
    andq $-64, %rsp
    subq $384, %rsp
    movl $16777216, %eax
    movl $67108863, %ecx
    movl $5, %r8d
    vmovd %eax, %xmm1
    vmovd %ecx, %xmm10
    vmovd %r8d, %xmm0
    movq 176(%rdi), %rax
    vpbroadcastq %xmm1, %ymm1
    vpbroadcastq %xmm10, %ymm10
    vpbroadcastq %xmm0, %ymm11
    testq $60, %rax
    je poly1305_blocks_avx2_11
poly1305_blocks_avx2_2:
    /*
     * Start from the 1 << 24 value in qword lanes 0 and 2 only, then permute.
     * With only one of the bits set: 4 keeps it in lanes 0-2, 8 in lanes 0-1,
     * 16 in lane 0, 32 clears it everywhere.
     */
    vpsrldq $8, %ymm1, %ymm15
    testq $4, %rax
    je poly1305_blocks_avx2_4
poly1305_blocks_avx2_3:
    vpermq $192, %ymm15, %ymm15
poly1305_blocks_avx2_4:
    testq $8, %rax
    je poly1305_blocks_avx2_6
poly1305_blocks_avx2_5:
    vpermq $240, %ymm15, %ymm15
poly1305_blocks_avx2_6:
    testq $16, %rax
    je poly1305_blocks_avx2_8
poly1305_blocks_avx2_7:
    vpermq $252, %ymm15, %ymm15
poly1305_blocks_avx2_8:
    testq $32, %rax
    je poly1305_blocks_avx2_10
poly1305_blocks_avx2_9:
    vpxor %ymm15, %ymm15, %ymm15
poly1305_blocks_avx2_10:
    vmovdqa %ymm15, %ymm1
poly1305_blocks_avx2_11:
    /* bts sets bit 0 in the copy; carry = bit 0 was already set */
    movq %rax, %rcx
    btsq $0, %rcx
    jc poly1305_blocks_avx2_13
poly1305_blocks_avx2_12:
    /*
     * First call for this state: the accumulator is initialised directly from
     * the first 64 input bytes, split into five 26-bit limbs per block
     * (ymm4, ymm3, ymm5, ymm7, ymm9), with ymm1 OR-ed into the top limb.
     * The updated flag word (bit 0 set) is written back.
     */
    vmovdqu (%rsi), %ymm3
    movq %rcx, %rax
    vmovdqu 32(%rsi), %ymm5
    vpunpcklqdq %ymm5, %ymm3, %ymm4
    addq $64, %rsi
    vpunpckhqdq %ymm5, %ymm3, %ymm7
    vpermq $216, %ymm4, %ymm6
    addq $-64, %rdx
    vpermq $216, %ymm7, %ymm0
    vpsrlq $52, %ymm6, %ymm8
    vpsllq $12, %ymm0, %ymm9
    vpsrlq $26, %ymm6, %ymm2
    vpsrlq $40, %ymm0, %ymm0
    vpand %ymm6, %ymm10, %ymm4
    vpor %ymm9, %ymm8, %ymm7
    vpand %ymm2, %ymm10, %ymm3
    vpor %ymm1, %ymm0, %ymm9
    vpsrlq $26, %ymm7, %ymm2
    vpand %ymm7, %ymm10, %ymm5
    vpand %ymm2, %ymm10, %ymm7
    movq %rax, 176(%rdi)
    jmp poly1305_blocks_avx2_14
poly1305_blocks_avx2_13:
    /*
     * Later calls: reload the accumulator saved at 0..79(state) as packed
     * 32-bit limbs and widen each to a 64-bit lane.
     */
    vpermq $216, (%rdi), %ymm15
    vpxor %ymm0, %ymm0, %ymm0
    vpermq $216, 32(%rdi), %ymm14
    vpermq $216, 64(%rdi), %ymm13
    vpunpckldq %ymm0, %ymm15, %ymm4
    vpunpckhdq %ymm0, %ymm15, %ymm3
    vpunpckldq %ymm0, %ymm14, %ymm5
    vpunpckhdq %ymm0, %ymm14, %ymm7
    vpunpckldq %ymm0, %ymm13, %ymm9
poly1305_blocks_avx2_14:
    cmpq $64, %rdx
    jb poly1305_blocks_avx2_34
poly1305_blocks_avx2_15:
    /*
     * Load the multiplier limbs. 80/100/120/140(state) are the four
     * 5 x 32-bit tables written by init_ext. With none of bits 128..4096 set
     * the table at 140 is broadcast to all four lanes.
     */
    vmovdqu 140(%rdi), %ymm0
    testq $8064, %rax
    je poly1305_blocks_avx2_29
poly1305_blocks_avx2_16:
    /*
     * Per-lane multiplier selection: ymm15..ymm12 receive the table used for
     * lanes 0..3 (presumably; the unpack sequence at label 28 interleaves
     * them - confirm lane order before relying on this).
     */
    vpermq $216, 80(%rdi), %ymm6
    vpermq $216, 100(%rdi), %ymm2
    vpermq $216, 120(%rdi), %ymm8
    vpermq $216, %ymm0, %ymm0
    testq $128, %rax
    je poly1305_blocks_avx2_18
poly1305_blocks_avx2_17:
    vmovdqa %ymm0, %ymm15
    vmovdqa %ymm0, %ymm14
    vmovdqa %ymm0, %ymm13
    vmovdqa %ymm8, %ymm12
    jmp poly1305_blocks_avx2_28
poly1305_blocks_avx2_18:
    testq $256, %rax
    je poly1305_blocks_avx2_20
poly1305_blocks_avx2_19:
    vmovdqa %ymm0, %ymm15
    vmovdqa %ymm0, %ymm14
    vmovdqa %ymm8, %ymm13
    vmovdqa %ymm2, %ymm12
    jmp poly1305_blocks_avx2_28
poly1305_blocks_avx2_20:
    testq $512, %rax
    je poly1305_blocks_avx2_22
poly1305_blocks_avx2_21:
    vmovdqa %ymm0, %ymm15
    vmovdqa %ymm8, %ymm14
    vmovdqa %ymm2, %ymm13
    vmovdqa %ymm6, %ymm12
    jmp poly1305_blocks_avx2_28
poly1305_blocks_avx2_22:
    testq $1024, %rax
    je poly1305_blocks_avx2_24
poly1305_blocks_avx2_23:
    /* last slot becomes the constant 1 (low dword = 1, rest zero) */
    vpxor %ymm12, %ymm12, %ymm12
    movl $1, %r8d
    vmovdqa %ymm8, %ymm15
    vmovdqa %ymm2, %ymm14
    vmovdqa %ymm6, %ymm13
    vmovd %r8d, %xmm12
    jmp poly1305_blocks_avx2_28
poly1305_blocks_avx2_24:
    testq $2048, %rax
    je poly1305_blocks_avx2_26
poly1305_blocks_avx2_25:
    /* last two slots become the constant 1 */
    vpxor %ymm12, %ymm12, %ymm12
    movl $1, %r8d
    vmovd %r8d, %xmm13
    vmovdqa %ymm2, %ymm15
    vmovdqa %ymm6, %ymm14
    vmovdqa %ymm13, %ymm12
    jmp poly1305_blocks_avx2_28
poly1305_blocks_avx2_26:
    testq $4096, %rax
    je poly1305_blocks_avx2_28
poly1305_blocks_avx2_27:
    /* last three slots become the constant 1 */
    movl $1, %r8d
    vmovd %r8d, %xmm14
    vmovdqa %ymm6, %ymm15
    vmovdqa %ymm14, %ymm13
    vmovdqa %ymm14, %ymm12
poly1305_blocks_avx2_28:
    /*
     * Transpose the four selected tables into one vector per limb:
     * ymm2, ymm8, ymm6, 352(%rsp), 320(%rsp) hold limbs 0..4.
     */
    vpunpcklqdq %ymm14, %ymm15, %ymm6
    vpunpcklqdq %ymm12, %ymm13, %ymm8
    vpunpckhqdq %ymm14, %ymm15, %ymm14
    vpunpckhqdq %ymm12, %ymm13, %ymm12
    vperm2i128 $32, %ymm8, %ymm6, %ymm2
    vperm2i128 $49, %ymm8, %ymm6, %ymm6
    vpsrlq $32, %ymm6, %ymm0
    vpsrlq $32, %ymm2, %ymm8
    vmovdqu %ymm0, 352(%rsp)
    vperm2i128 $32, %ymm12, %ymm14, %ymm13
    vmovdqu %ymm13, 320(%rsp)
    jmp poly1305_blocks_avx2_30
poly1305_blocks_avx2_29:
    /* broadcast each limb of the table at 140(state) to all four lanes */
    vpsrlq $32, %ymm0, %ymm12
    vpermq $0, %ymm0, %ymm2
    vpermq $85, %ymm0, %ymm6
    vpermq $85, %ymm12, %ymm13
    vpermq $170, %ymm0, %ymm0
    vpermq $0, %ymm12, %ymm8
    vmovdqu %ymm13, 352(%rsp)
    vmovdqu %ymm0, 320(%rsp)
poly1305_blocks_avx2_30:
    /*
     * First multiply/accumulate step, peeled out of the loop.
     * Spill slots: 128 = constant 5, 160 = 1 << 24 vector, 192 = limb mask,
     * 224/256/288/(%rsp) = 5 * multiplier limbs 1/2/3/4.
     * The 64 input bytes are split into 32-bit words and added at shifts
     * 0 / 6 / 12 / 18 into limbs 0..3; the 160(%rsp) vector goes into limb 4.
     * r8 = iteration counter, rcx = byte offset into the input for the loop.
     */
    vmovdqu (%rsi), %ymm12
    movq %rdx, %r9
    vmovdqu 352(%rsp), %ymm15
    vmovdqu %ymm1, 160(%rsp)
    vmovdqu %ymm10, 192(%rsp)
    vmovdqu %ymm11, 128(%rsp)
    vperm2i128 $32, 32(%rsi), %ymm12, %ymm13
    xorl %r8d, %r8d
    vperm2i128 $49, 32(%rsi), %ymm12, %ymm12
    xorl %ecx, %ecx
    vpmuludq %ymm11, %ymm8, %ymm0
    vpmuludq %ymm11, %ymm6, %ymm1
    vmovdqu %ymm0, 224(%rsp)
    vmovdqu %ymm1, 256(%rsp)
    vpunpckldq %ymm12, %ymm13, %ymm14
    vpunpckhdq %ymm12, %ymm13, %ymm12
    vmovdqu %ymm14, 32(%rsp)
    vpmuludq %ymm0, %ymm9, %ymm0
    vpmuludq %ymm1, %ymm7, %ymm13
    vpaddq %ymm13, %ymm0, %ymm0
    vpmuludq %ymm11, %ymm15, %ymm10
    vpmuludq %ymm10, %ymm5, %ymm13
    vpaddq %ymm13, %ymm0, %ymm0
    vmovdqu %ymm10, 288(%rsp)
    vpmuludq 320(%rsp), %ymm11, %ymm11
    vpmuludq %ymm11, %ymm3, %ymm13
    vpaddq %ymm13, %ymm0, %ymm0
    vmovdqu %ymm11, (%rsp)
    vpmuludq %ymm2, %ymm4, %ymm13
    vpaddq %ymm13, %ymm0, %ymm0
    vpxor %ymm13, %ymm13, %ymm13
    vpunpckldq %ymm13, %ymm14, %ymm14
    vpaddq %ymm14, %ymm0, %ymm0
    vmovdqu %ymm0, 64(%rsp)
    vpmuludq %ymm11, %ymm9, %ymm14
    vpmuludq %ymm2, %ymm7, %ymm0
    vpaddq %ymm0, %ymm14, %ymm14
    vpmuludq %ymm8, %ymm5, %ymm0
    vpaddq %ymm0, %ymm14, %ymm14
    vpmuludq %ymm6, %ymm3, %ymm0
    vpaddq %ymm0, %ymm14, %ymm14
    vpmuludq %ymm15, %ymm4, %ymm0
    vpaddq %ymm0, %ymm14, %ymm0
    vpunpckhdq %ymm13, %ymm12, %ymm14
    vpsllq $18, %ymm14, %ymm14
    vpaddq %ymm14, %ymm0, %ymm14
    vpmuludq %ymm1, %ymm9, %ymm1
    vpmuludq %ymm10, %ymm7, %ymm0
    vpaddq %ymm0, %ymm1, %ymm1
    vpmuludq %ymm11, %ymm5, %ymm0
    vpaddq %ymm0, %ymm1, %ymm1
    vpmuludq %ymm2, %ymm3, %ymm0
    vpaddq %ymm0, %ymm1, %ymm1
    vpmuludq %ymm8, %ymm4, %ymm0
    vpaddq %ymm0, %ymm1, %ymm1
    vmovdqu 32(%rsp), %ymm0
    vpunpckhdq %ymm13, %ymm0, %ymm0
    vpsllq $6, %ymm0, %ymm0
    vpaddq %ymm0, %ymm1, %ymm1
    vmovdqu 64(%rsp), %ymm0
    vpsrlq $26, %ymm0, %ymm0
    vpaddq %ymm0, %ymm1, %ymm1
    vmovdqu %ymm1, 96(%rsp)
    vpmuludq %ymm2, %ymm9, %ymm1
    vpmuludq %ymm8, %ymm7, %ymm0
    vpaddq %ymm0, %ymm1, %ymm1
    vpmuludq %ymm10, %ymm9, %ymm10
    vpmuludq %ymm11, %ymm7, %ymm11
    vpaddq %ymm11, %ymm10, %ymm7
    vpmuludq %ymm6, %ymm5, %ymm0
    vpaddq %ymm0, %ymm1, %ymm1
    vpmuludq %ymm2, %ymm5, %ymm5
    vpaddq %ymm5, %ymm7, %ymm10
    vpmuludq %ymm15, %ymm3, %ymm15
    vpaddq %ymm15, %ymm1, %ymm1
    vpmuludq %ymm8, %ymm3, %ymm11
    vpaddq %ymm11, %ymm10, %ymm5
    vpunpckldq %ymm13, %ymm12, %ymm10
    vmovdqu 96(%rsp), %ymm12
    vpmuludq 320(%rsp), %ymm4, %ymm0
    vpaddq %ymm0, %ymm1, %ymm15
    vpsrlq $26, %ymm12, %ymm3
    vmovdqu 160(%rsp), %ymm1
    vpmuludq %ymm6, %ymm4, %ymm4
    vpaddq %ymm1, %ymm15, %ymm0
    vpsrlq $26, %ymm14, %ymm15
    vpaddq %ymm4, %ymm5, %ymm11
    vpsllq $12, %ymm10, %ymm4
    vmovdqu 192(%rsp), %ymm10
    vpaddq %ymm15, %ymm0, %ymm0
    vpaddq %ymm4, %ymm11, %ymm5
    vmovdqu 128(%rsp), %ymm11
    /* carry propagation: shift by 26, mask, and fold the top carry in times 5 */
    vpsrlq $26, %ymm0, %ymm9
    vpaddq %ymm3, %ymm5, %ymm7
    vpand 64(%rsp), %ymm10, %ymm13
    vpand %ymm10, %ymm12, %ymm12
    vpand %ymm10, %ymm7, %ymm5
    vpsrlq $26, %ymm7, %ymm7
    vpmuludq %ymm11, %ymm9, %ymm15
    vpand %ymm10, %ymm14, %ymm9
    vpaddq %ymm15, %ymm13, %ymm3
    vpand %ymm10, %ymm0, %ymm14
    vpaddq %ymm7, %ymm9, %ymm9
    vpand %ymm10, %ymm3, %ymm4
    vpsrlq $26, %ymm3, %ymm3
    vpsrlq $26, %ymm9, %ymm0
    vpand %ymm10, %ymm9, %ymm7
    vpaddq %ymm3, %ymm12, %ymm3
    vpaddq %ymm0, %ymm14, %ymm9
    /* r9 = rdx / 64 using the signed round-toward-zero idiom; need >= 2 to loop */
    sarq $5, %r9
    shrq $58, %r9
    addq %rdx, %r9
    sarq $6, %r9
    cmpq $2, %r9
    jl poly1305_blocks_avx2_34
poly1305_blocks_avx2_31:
    /*
     * Re-spill loop invariants and compute the loop trip count
     * rdx = (rdx - 64) / 64 with the same signed-division idiom.
     * ymm12 keeps 5 * multiplier limb 4 because (%rsp) is reused in the loop.
     */
    vmovdqu %ymm6, 32(%rsp)
    lea -64(%rdx), %r9
    vmovdqu %ymm8, 64(%rsp)
    vmovdqu %ymm11, 128(%rsp)
    vmovdqu %ymm10, 192(%rsp)
    vmovdqu %ymm1, 160(%rsp)
    vmovdqu (%rsp), %ymm12
    sarq $5, %r9
    shrq $58, %r9
    lea -64(%rdx,%r9), %rdx
    sarq $6, %rdx
poly1305_blocks_avx2_32:
    /*
     * Main loop: same multiply / add-input / carry sequence as above, reading
     * the 64 bytes at 64(%rcx,%rsi) and advancing rcx by 64 per iteration.
     */
    vmovdqu 256(%rsp), %ymm15
    incq %r8
    vmovdqu 64(%rcx,%rsi), %ymm11
    vpmuludq 224(%rsp), %ymm9, %ymm8
    vpmuludq %ymm15, %ymm7, %ymm14
    vpaddq %ymm14, %ymm8, %ymm1
    vmovdqu 288(%rsp), %ymm8
    vperm2i128 $32, 96(%rcx,%rsi), %ymm11, %ymm10
    vperm2i128 $49, 96(%rcx,%rsi), %ymm11, %ymm6
    addq $64, %rcx
    vpmuludq %ymm8, %ymm5, %ymm13
    vpunpckldq %ymm6, %ymm10, %ymm0
    vpunpckhdq %ymm6, %ymm10, %ymm11
    vpaddq %ymm13, %ymm1, %ymm10
    vpmuludq %ymm12, %ymm3, %ymm6
    vpaddq %ymm6, %ymm10, %ymm14
    vpxor %ymm10, %ymm10, %ymm10
    vpunpckldq %ymm10, %ymm0, %ymm6
    vpunpckhdq %ymm10, %ymm0, %ymm0
    vpmuludq %ymm2, %ymm4, %ymm1
    vpaddq %ymm1, %ymm14, %ymm13
    vpaddq %ymm6, %ymm13, %ymm1
    vmovdqu 64(%rsp), %ymm6
    vmovdqu %ymm1, (%rsp)
    vpsrlq $26, %ymm1, %ymm1
    vpmuludq %ymm12, %ymm9, %ymm14
    vpmuludq %ymm2, %ymm7, %ymm13
    vpaddq %ymm13, %ymm14, %ymm14
    vpmuludq %ymm6, %ymm5, %ymm13
    vpaddq %ymm13, %ymm14, %ymm14
    vpmuludq 32(%rsp), %ymm3, %ymm13
    vpaddq %ymm13, %ymm14, %ymm14
    vpmuludq 352(%rsp), %ymm4, %ymm13
    vpaddq %ymm13, %ymm14, %ymm13
    vpunpckhdq %ymm10, %ymm11, %ymm14
    vpsllq $18, %ymm14, %ymm14
    vpaddq %ymm14, %ymm13, %ymm13
    vpmuludq %ymm15, %ymm9, %ymm15
    vpmuludq %ymm8, %ymm7, %ymm14
    vpaddq %ymm14, %ymm15, %ymm15
    vpmuludq %ymm12, %ymm5, %ymm14
    vpaddq %ymm14, %ymm15, %ymm15
    vpmuludq %ymm2, %ymm3, %ymm14
    vpaddq %ymm14, %ymm15, %ymm15
    vpmuludq %ymm6, %ymm4, %ymm14
    vpaddq %ymm14, %ymm15, %ymm14
    vpsllq $6, %ymm0, %ymm15
    vpaddq %ymm15, %ymm14, %ymm14
    vmovdqu 32(%rsp), %ymm15
    vpaddq %ymm1, %ymm14, %ymm1
    vpmuludq %ymm2, %ymm9, %ymm0
    vpmuludq %ymm6, %ymm7, %ymm14
    vpmuludq %ymm8, %ymm9, %ymm9
    vpmuludq %ymm12, %ymm7, %ymm7
    vpaddq %ymm7, %ymm9, %ymm7
    vpaddq %ymm14, %ymm0, %ymm0
    vpsrlq $26, %ymm1, %ymm9
    vpmuludq %ymm15, %ymm5, %ymm14
    vpmuludq %ymm2, %ymm5, %ymm5
    vpaddq %ymm5, %ymm7, %ymm5
    vpaddq %ymm14, %ymm0, %ymm0
    vpmuludq 352(%rsp), %ymm3, %ymm14
    vpmuludq %ymm6, %ymm3, %ymm3
    vpaddq %ymm3, %ymm5, %ymm5
    vpaddq %ymm14, %ymm0, %ymm0
    vpmuludq 320(%rsp), %ymm4, %ymm14
    vpmuludq %ymm15, %ymm4, %ymm4
    vpaddq %ymm4, %ymm5, %ymm5
    vpaddq %ymm14, %ymm0, %ymm0
    vpunpckldq %ymm10, %ymm11, %ymm4
    vpaddq 160(%rsp), %ymm0, %ymm14
    vpsrlq $26, %ymm13, %ymm0
    vpsllq $12, %ymm4, %ymm3
    vpaddq %ymm0, %ymm14, %ymm14
    vpaddq %ymm3, %ymm5, %ymm7
    vpsrlq $26, %ymm14, %ymm0
    vpaddq %ymm9, %ymm7, %ymm10
    vmovdqu 192(%rsp), %ymm9
    vpsrlq $26, %ymm10, %ymm11
    vpand (%rsp), %ymm9, %ymm6
    vpand %ymm9, %ymm13, %ymm13
    vpand %ymm9, %ymm1, %ymm1
    vpand %ymm9, %ymm14, %ymm14
    vpand %ymm9, %ymm10, %ymm5
    vpmuludq 128(%rsp), %ymm0, %ymm8
    vpaddq %ymm8, %ymm6, %ymm15
    vpaddq %ymm11, %ymm13, %ymm0
    vpsrlq $26, %ymm15, %ymm3
    vpand %ymm9, %ymm0, %ymm7
    vpsrlq $26, %ymm0, %ymm0
    vpand %ymm9, %ymm15, %ymm4
    vpaddq %ymm3, %ymm1, %ymm3
    vpaddq %ymm0, %ymm14, %ymm9
    cmpq %rdx, %r8
    jb poly1305_blocks_avx2_32
poly1305_blocks_avx2_34:
    testq $64, %rax
    jne poly1305_blocks_avx2_36
poly1305_blocks_avx2_35:
    /*
     * Not the final call: pack the low 32 bits of every lane of the five limb
     * vectors and save them at 0..79(state) for the next call.
     */
    vpshufd $8, %ymm4, %ymm0
    vpshufd $8, %ymm3, %ymm3
    vpshufd $8, %ymm5, %ymm5
    vpshufd $8, %ymm7, %ymm7
    vpshufd $8, %ymm9, %ymm9
    vpermq $8, %ymm0, %ymm1
    vpermq $8, %ymm3, %ymm2
    vpermq $8, %ymm5, %ymm4
    vpermq $8, %ymm7, %ymm6
    vpermq $8, %ymm9, %ymm11
    vperm2i128 $32, %ymm2, %ymm1, %ymm8
    vperm2i128 $32, %ymm6, %ymm4, %ymm10
    vmovdqu %ymm8, (%rdi)
    vmovdqu %ymm10, 32(%rdi)
    vmovdqu %xmm11, 64(%rdi)
    jmp poly1305_blocks_avx2_37
poly1305_blocks_avx2_36:
    /*
     * Final call (flag bit 64): add the four lanes of every limb together
     * (two vpermq/vpaddq steps each), then continue in scalar registers.
     */
    vpermq $245, %ymm4, %ymm0
    vpaddq %ymm0, %ymm4, %ymm4
    vpermq $245, %ymm3, %ymm1
    vpaddq %ymm1, %ymm3, %ymm10
    vpermq $245, %ymm5, %ymm3
    vpermq $170, %ymm4, %ymm6
    vpaddq %ymm3, %ymm5, %ymm13
    vpaddq %ymm6, %ymm4, %ymm8
    vpermq $170, %ymm10, %ymm11
    vpermq $245, %ymm7, %ymm5
    vpaddq %ymm11, %ymm10, %ymm12
    vpaddq %ymm5, %ymm7, %ymm7
    vpermq $170, %ymm13, %ymm14
    vpermq $245, %ymm9, %ymm2
    vpaddq %ymm14, %ymm13, %ymm15
    vpaddq %ymm2, %ymm9, %ymm9
    vpermq $170, %ymm7, %ymm0
    vpaddq %ymm0, %ymm7, %ymm1
    vpermq $170, %ymm9, %ymm2
    vpaddq %ymm2, %ymm9, %ymm3
    /* carry through the five 26-bit limbs (r9, r11, rcx, r10, rdx) */
    vmovd %xmm8, %r9d
    movl %r9d, %r8d
    shrl $26, %r8d
    andq $67108863, %r9
    vmovd %xmm12, %esi
    addl %r8d, %esi
    movl %esi, %r11d
    shrl $26, %esi
    andq $67108863, %r11
    vmovd %xmm15, %ecx
    addl %esi, %ecx
    movl %ecx, %eax
    shrl $26, %eax
    andq $67108863, %rcx
    shlq $8, %rcx
    vmovd %xmm1, %r8d
    addl %eax, %r8d
    movl %r8d, %r10d
    shrl $26, %r8d
    andq $67108863, %r10
    movq %r10, %rax
    shrq $10, %rax
    shlq $34, %r10
    vmovd %xmm3, %edx
    addl %r8d, %edx
    shlq $16, %rdx
    orq %rdx, %rax
    /*
     * Repack into three words of 44 / 44 / 42 bits (masks 0xfffffffffff and
     * 0x3ffffffffff); bits above the 42-bit top word are folded back into the
     * low word multiplied by 5.
     */
    movq %rax, %r8
    shrq $42, %r8
    lea (%r8,%r8,4), %rdx
    movq %r11, %r8
    shlq $26, %r8
    orq %r8, %r9
    movq $0xfffffffffff, %r8
    shrq $18, %r11
    andq %r8, %r9
    addq %r9, %rdx
    orq %rcx, %r11
    movq %rdx, %rsi
    orq %r10, %r11
    shrq $44, %rsi
    andq %r8, %r11
    addq %r11, %rsi
    movq $0x3ffffffffff, %r9
    movq %rsi, %r10
    andq %r9, %rax
    shrq $44, %r10
    andq %r8, %rdx
    addq %r10, %rax
    movq %r8, %rcx
    andq %rax, %r9
    andq %r8, %rsi
    shrq $42, %rax
    movq $0xfffffc0000000000, %r10
    lea (%rax,%rax,4), %r11
    addq %r11, %rdx
    andq %rdx, %rcx
    shrq $44, %rdx
    addq %rdx, %rsi
    /*
     * Compute the candidate value + 5 - 2^130 in (rdx, r8, rax); r10 becomes
     * all ones when that did not go negative, zero otherwise, and is used as
     * a branch-free select mask between the two candidates.
     */
    lea 5(%rcx), %rdx
    movq %rdx, %r11
    andq %r8, %rdx
    shrq $44, %r11
    addq %rsi, %r11
    movq %r11, %rax
    andq %r11, %r8
    shrq $44, %rax
    addq %r9, %rax
    addq %r10, %rax
    movq %rax, %r10
    shrq $63, %r10
    decq %r10
    andn %rcx, %r10, %rcx
    andq %r10, %rdx
    orq %rdx, %rcx
    andq %r10, %r8
    andn %rsi, %r10, %rdx
    andq %r10, %rax
    andn %r9, %r10, %rsi
    orq %r8, %rdx
    orq %rax, %rsi
    /* store the three reduced words at 0/8/16(state) */
    movq %rcx, (%rdi)
    movq %rdx, 8(%rdi)
    movq %rsi, 16(%rdi)
poly1305_blocks_avx2_37:
    vzeroupper
    movq %rbp, %rsp
    popq %rbp
    ret
FN_END poly1305_blocks_avx2

/*
 * poly1305_init_ext_avx2
 *   %rdi  state pointer (kept in r10)
 *   %rsi  pointer to 32 bytes: bytes 0..15 are masked into the multiplier,
 *         bytes 16..31 are copied verbatim to 160/168(state)
 *   %rdx  byte-count hint (kept in r12); 0 is replaced by -1
 *
 * State layout written here:
 *   0..79    zeroed
 *   80..99   multiplier as 5 x 32-bit limbs of 26 bits
 *   100..119 its square            (only when hint > 16)
 *   120..139 multiplier * square   (only when hint > 32)
 *   140..159 square of the square  (only when hint > 48)
 *   160..175 copy of input bytes 16..31
 *   176      flag word, set to 0
 * Products are computed on 44/44/42-bit words with mulq and reduced by
 * folding the bits above 2^130 back in multiplied by 5.
 *
 * NOTE(review): this routine stores to -16(%rsp) without adjusting %rsp and
 * makes no calls, so it relies on the area below the stack pointer staying
 * intact - confirm this is acceptable for every build target.
 */
GLOBAL_HIDDEN_FN poly1305_init_ext_avx2
poly1305_init_ext_avx2_local:
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    pushq %rbx
    movq %rdi, %r10
    vpxor %ymm0, %ymm0, %ymm0
    movq %rdx, %r12
    vpxor %xmm1, %xmm1, %xmm1
    /* clear state bytes 0..79 */
    vmovdqu %xmm1, 64(%r10)
    vmovdqu %ymm0, (%r10)
    vmovdqu %ymm0, 32(%r10)
    movq $-1, %r8
    testq %r12, %r12
    movq 8(%rsi), %rdi
    movq $0xffc0fffffff, %r9
    movq %rdi, %rcx
    /* hint == 0 -> treat as maximum */
    cmove %r8, %r12
    /* split the first 16 input bytes into masked 44/44/42-bit words r9, r8, rdi */
    movq (%rsi), %r8
    andq %r8, %r9
    shrq $44, %r8
    movq $0xfffffc0ffff, %r11
    shlq $20, %rcx
    shrq $24, %rdi
    orq %rcx, %r8
    movq $0xffffffc0f, %rcx
    andq %r11, %r8
    andq %rcx, %rdi
    /* copy input bytes 16..31 to 160/168(state) */
    movq 16(%rsi), %rcx
    movq %rcx, 160(%r10)
    movq %r9, %rcx
    movq 24(%rsi), %rdx
    movq %rdx, 168(%r10)
    /* store the multiplier as five 26-bit limbs at 80..96(state) */
    movl %r9d, %edx
    andl $67108863, %edx
    movl %edx, 80(%r10)
    movq %r8, %rdx
    shrq $26, %rcx
    shlq $18, %rdx
    orq %rdx, %rcx
    movq %r8, %rdx
    shrq $8, %rdx
    andl $67108863, %ecx
    andl $67108863, %edx
    movl %ecx, 84(%r10)
    movq %r8, %rcx
    movl %edx, 88(%r10)
    movq %rdi, %rdx
    shrq $34, %rcx
    shlq $10, %rdx
    orq %rdx, %rcx
    movq %rdi, %rdx
    shrq $16, %rdx
    andl $67108863, %ecx
    movl %ecx, 92(%r10)
    movl %edx, 96(%r10)
    cmpq $16, %r12
    jbe poly1305_init_ext_avx2_7
poly1305_init_ext_avx2_2:
    /*
     * Square (r9, r8, rdi); r14 = 20 * rdi is the pre-scaled top word used
     * for the wrapped-around partial products. Result words: rsi, rcx, rbx.
     */
    movq %r9, %rax
    lea (%rdi,%rdi,4), %r14
    mulq %r9
    shlq $2, %r14
    movq %rax, %r11
    movq %rdx, %r15
    lea (%r8,%r8), %rax
    mulq %r14
    addq %rax, %r11
    lea (%r9,%r9), %rax
    movq %r11, %rsi
    adcq %rdx, %r15
    mulq %r8
    movq %rax, %rbx
    movq %r14, %rax
    movq %rdx, %rcx
    lea (%rdi,%rdi), %r14
    mulq %rdi
    addq %rax, %rbx
    movq %r8, %rax
    adcq %rdx, %rcx
    mulq %r8
    shlq $20, %r15
    movq %rax, %r13
    shrq $44, %rsi
    movq %r9, %rax
    orq %rsi, %r15
    movq %rdx, %rsi
    mulq %r14
    addq %r15, %rbx
    movq %rbx, %r15
    adcq $0, %rcx
    addq %rax, %r13
    adcq %rdx, %rsi
    shlq $20, %rcx
    shrq $44, %r15
    orq %r15, %rcx
    addq %rcx, %r13
    movq $0xfffffffffff, %rcx
    movq %r13, %rdx
    adcq $0, %rsi
    andq %rcx, %r11
    shlq $22, %rsi
    andq %rcx, %rbx
    shrq $42, %rdx
    orq %rdx, %rsi
    lea (%rsi,%rsi,4), %rsi
    addq %rsi, %r11
    movq %rcx, %rsi
    andq %r11, %rsi
    shrq $44, %r11
    addq %r11, %rbx
    movq $0x3ffffffffff, %r11
    andq %rbx, %rcx
    andq %r11, %r13
    shrq $44, %rbx
    /* convert the square to 26-bit limbs and store at 100..116(state) */
    movq %rsi, %r11
    movq %rcx, %rdx
    addq %r13, %rbx
    shrq $26, %r11
    movq %rbx, %r15
    shlq $18, %rdx
    movq %rcx, %r14
    orq %rdx, %r11
    movq %rcx, %rdx
    shrq $34, %rdx
    movl %esi, %r13d
    shlq $10, %r15
    andl $67108863, %r13d
    orq %r15, %rdx
    andl $67108863, %r11d
    shrq $8, %r14
    andl $67108863, %edx
    movl %edx, 112(%r10)
    movq %rbx, %rdx
    shrq $16, %rdx
    andl $67108863, %r14d
    movl %r13d, 100(%r10)
    movl %r11d, 104(%r10)
    movl %r14d, 108(%r10)
    movl %edx, 116(%r10)
    cmpq $48, %r12
    jbe poly1305_init_ext_avx2_4
poly1305_init_ext_avx2_3:
    /*
     * hint > 48: square (rsi, rcx, rbx) again and store the result at
     * 140..156(state). r15 = 20 * rbx is parked at -16(%rsp) and reloaded
     * into r11 for the product at label 6. r12 is reused as scratch here.
     */
    movq %rsi, %rax
    lea (%rbx,%rbx,4), %r15
    mulq %rsi
    shlq $2, %r15
    movq %rax, %r13
    movq %rdx, %r12
    lea (%rcx,%rcx), %rax
    mulq %r15
    addq %rax, %r13
    lea (%rsi,%rsi), %rax
    movq %r15, -16(%rsp)
    adcq %rdx, %r12
    mulq %rcx
    movq %rax, %r14
    movq %rbx, %rax
    movq %rdx, %r11
    mulq %r15
    addq %rax, %r14
    movq %rcx, %rax
    movq %r13, %r15
    adcq %rdx, %r11
    mulq %rcx
    shlq $20, %r12
    shrq $44, %r15
    orq %r15, %r12
    movq %rax, %r15
    addq %r12, %r14
    movq %rdx, %r12
    movq %rsi, %rax
    lea (%rbx,%rbx), %rdx
    adcq $0, %r11
    mulq %rdx
    addq %rax, %r15
    adcq %rdx, %r12
    movq %r14, %rdx
    shlq $20, %r11
    shrq $44, %rdx
    orq %rdx, %r11
    addq %r11, %r15
    movq $0xfffffffffff, %r11
    movq %r15, %rdx
    adcq $0, %r12
    andq %r11, %r13
    shlq $22, %r12
    andq %r11, %r14
    shrq $42, %rdx
    orq %rdx, %r12
    lea (%r12,%r12,4), %r12
    addq %r12, %r13
    movq %r11, %r12
    andq %r13, %r12
    shrq $44, %r13
    addq %r13, %r14
    movq $0x3ffffffffff, %r13
    andq %r14, %r11
    andq %r13, %r15
    shrq $44, %r14
    movq %r11, %rdx
    shlq $18, %rdx
    addq %r14, %r15
    movl %r12d, %r14d
    movq %r11, %r13
    shrq $26, %r12
    andl $67108863, %r14d
    orq %rdx, %r12
    movq %r15, %rdx
    shrq $34, %r11
    shlq $10, %rdx
    andl $67108863, %r12d
    orq %rdx, %r11
    shrq $8, %r13
    andl $67108863, %r11d
    movl %r11d, 152(%r10)
    andl $67108863, %r13d
    shrq $16, %r15
    movl %r14d, 140(%r10)
    movl %r12d, 144(%r10)
    movl %r13d, 148(%r10)
    movl %r15d, 156(%r10)
    movq -16(%rsp), %r11
    jmp poly1305_init_ext_avx2_6
poly1305_init_ext_avx2_4:
    cmpq $32, %r12
    jbe poly1305_init_ext_avx2_7
poly1305_init_ext_avx2_5:
    /* 32 < hint <= 48: r11 = 20 * rbx, same value the other path reloads */
    lea (%rbx,%rbx,4), %r11
    shlq $2, %r11
poly1305_init_ext_avx2_6:
    /*
     * Multiply (r9, r8, rdi) by (rsi, rcx, rbx); r13 = 20 * rcx and
     * r11 = 20 * rbx scale the wrapped-around partial products.
     * The result is stored as 26-bit limbs at 120..136(state).
     */
    movq %r9, %rax
    lea (%rcx,%rcx,4), %r13
    mulq %rsi
    shlq $2, %r13
    movq %rax, %r14
    movq %rdi, %rax
    movq %rdx, %r12
    mulq %r13
    addq %rax, %r14
    movq %r8, %rax
    adcq %rdx, %r12
    mulq %r11
    addq %rax, %r14
    movq %r8, %rax
    adcq %rdx, %r12
    mulq %rsi
    movq %rax, %r15
    movq %r9, %rax
    movq %rdx, %r13
    mulq %rcx
    addq %rax, %r15
    movq %r11, %rax
    movq %r14, %r11
    adcq %rdx, %r13
    mulq %rdi
    addq %rax, %r15
    movq %rdi, %rax
    adcq %rdx, %r13
    mulq %rsi
    shlq $20, %r12
    movq %rax, %rsi
    shrq $44, %r11
    movq %r8, %rax
    orq %r11, %r12
    movq %rdx, %rdi
    mulq %rcx
    addq %r12, %r15
    movq %r15, %rcx
    adcq $0, %r13
    addq %rax, %rsi
    movq %r9, %rax
    movq $0xfffffffffff, %r9
    adcq %rdx, %rdi
    andq %r9, %r14
    mulq %rbx
    addq %rax, %rsi
    adcq %rdx, %rdi
    movq %r9, %rdx
    shlq $20, %r13
    andq %r9, %r15
    shrq $44, %rcx
    orq %rcx, %r13
    addq %r13, %rsi
    movq %rsi, %rbx
    adcq $0, %rdi
    shlq $22, %rdi
    shrq $42, %rbx
    orq %rbx, %rdi
    lea (%rdi,%rdi,4), %r8
    addq %r8, %r14
    andq %r14, %rdx
    shrq $44, %r14
    addq %r14, %r15
    movq $0x3ffffffffff, %r14
    andq %r15, %r9
    andq %r14, %rsi
    shrq $44, %r15
    movq %r9, %rax
    addq %r15, %rsi
    movl %edx, %r15d
    movq %rsi, %rbx
    movq %r9, %rcx
    shrq $26, %rdx
    andl $67108863, %r15d
    shlq $18, %rax
    shrq $34, %r9
    orq %rax, %rdx
    shlq $10, %rbx
    shrq $8, %rcx
    orq %rbx, %r9
    shrq $16, %rsi
    andl $67108863, %edx
    andl $67108863, %ecx
    andl $67108863, %r9d
    movl %r15d, 120(%r10)
    movl %edx, 124(%r10)
    movl %ecx, 128(%r10)
    movl %r9d, 132(%r10)
    movl %esi, 136(%r10)
poly1305_init_ext_avx2_7:
    /* flag word starts at zero */
    movq $0, 176(%r10)
    vzeroupper
    popq %rbx
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    ret
FN_END poly1305_init_ext_avx2