From 06a8ad2bae9e0aa0fe62e6059198bb3ec57eb08f Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 7 Feb 2015 22:10:07 +0000 Subject: Use optimized version of poly1305. --- src/libcryptobox/poly1305/avx2.S | 1093 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 1093 insertions(+) create mode 100644 src/libcryptobox/poly1305/avx2.S (limited to 'src/libcryptobox/poly1305/avx2.S') diff --git a/src/libcryptobox/poly1305/avx2.S b/src/libcryptobox/poly1305/avx2.S new file mode 100644 index 000000000..068e24d3d --- /dev/null +++ b/src/libcryptobox/poly1305/avx2.S @@ -0,0 +1,1093 @@ +#include "../chacha20/macro.S" +#include "constants.S" +SECTION_TEXT + +GLOBAL_HIDDEN_FN_EXT poly1305_block_size_avx2, 0, 0 +movl $64, %eax +ret +FN_END poly1305_block_size_avx2 + +GLOBAL_HIDDEN_FN poly1305_auth_avx2 +cmp $128, %rdx +jb poly1305_auth_x86_local +pushq %rbp +movq %rsp, %rbp +andq $-64, %rsp +pushq %r12 +pushq %r14 +pushq %r15 +pushq %rbx +subq $224, %rsp +movq %rsi, %r14 +movq %rdi, %rbx +lea (%rsp), %rdi +movq %rcx, %rsi +movq %rdx, %r12 +call poly1305_init_ext_avx2_local +poly1305_auth_avx2_2: +movq %r12, %r15 +andq $-64, %r15 +je poly1305_auth_avx2_5 +poly1305_auth_avx2_3: +movq %r14, %rsi +lea (%rsp), %rdi +movq %r15, %rdx +call poly1305_blocks_avx2_local +poly1305_auth_avx2_4: +addq %r15, %r14 +subq %r15, %r12 +poly1305_auth_avx2_5: +movq %r14, %rsi +lea (%rsp), %rdi +movq %r12, %rdx +movq %rbx, %rcx +call poly1305_finish_ext_avx2_local +poly1305_auth_avx2_6: +addq $224, %rsp +popq %rbx +popq %r15 +popq %r14 +popq %r12 +movq %rbp, %rsp +popq %rbp +ret +FN_END poly1305_auth_avx2 + + +GLOBAL_HIDDEN_FN poly1305_finish_ext_avx2 +poly1305_finish_ext_avx2_local: +pushq %rbp +movq %rsp, %rbp +andq $-64, %rsp +pushq %r12 +pushq %r13 +pushq %r14 +subq $104, %rsp +movq %rdx, %r13 +movq %rcx, %r14 +movq %rdi, %r12 +testq %r13, %r13 +je poly1305_finish_ext_avx2_29 +poly1305_finish_ext_avx2_2: +lea (%rsp), %rax +vpxor %ymm0, %ymm0, %ymm0 +subq %rax, %rsi +vmovdqu %ymm0, (%rsp) +vmovdqu %ymm0, 32(%rsp) +testq $32, %r13 +je poly1305_finish_ext_avx2_4 +poly1305_finish_ext_avx2_3: +vmovdqu (%rsp,%rsi), %ymm0 +lea 32(%rsp), %rax +vmovdqu %ymm0, (%rsp) +poly1305_finish_ext_avx2_4: +testq $16, %r13 +je poly1305_finish_ext_avx2_6 +poly1305_finish_ext_avx2_5: +vmovdqu (%rax,%rsi), %xmm0 +vmovdqu %xmm0, (%rax) +addq $16, %rax +poly1305_finish_ext_avx2_6: +testq $8, %r13 +je poly1305_finish_ext_avx2_8 +poly1305_finish_ext_avx2_7: +movq (%rax,%rsi), %rdx +movq %rdx, (%rax) +addq $8, %rax +poly1305_finish_ext_avx2_8: +testq $4, %r13 +je poly1305_finish_ext_avx2_10 +poly1305_finish_ext_avx2_9: +movl (%rax,%rsi), %edx +movl %edx, (%rax) +addq $4, %rax +poly1305_finish_ext_avx2_10: +testq $2, %r13 +je poly1305_finish_ext_avx2_12 +poly1305_finish_ext_avx2_11: +movzwl (%rax,%rsi), %edx +movw %dx, (%rax) +addq $2, %rax +poly1305_finish_ext_avx2_12: +testq $1, %r13 +je poly1305_finish_ext_avx2_14 +poly1305_finish_ext_avx2_13: +movb (%rax,%rsi), %dl +movb %dl, (%rax) +poly1305_finish_ext_avx2_14: +testq $15, %r13 +je poly1305_finish_ext_avx2_16 +poly1305_finish_ext_avx2_15: +movb $1, (%rsp,%r13) +poly1305_finish_ext_avx2_16: +movq 176(%r12), %rdx +andq $-8125, %rdx +cmpq $48, %r13 +jb poly1305_finish_ext_avx2_18 +poly1305_finish_ext_avx2_17: +orq $4, %rdx +jmp poly1305_finish_ext_avx2_21 +poly1305_finish_ext_avx2_18: +cmpq $32, %r13 +jb poly1305_finish_ext_avx2_20 +poly1305_finish_ext_avx2_19: +orq $8, %rdx +jmp poly1305_finish_ext_avx2_21 +poly1305_finish_ext_avx2_20: +movq %rdx, %rax +orq $32, %rdx +orq $16, %rax 
+cmpq $16, %r13 +cmovae %rax, %rdx +poly1305_finish_ext_avx2_21: +testq $1, %rdx +je poly1305_finish_ext_avx2_27 +poly1305_finish_ext_avx2_22: +cmpq $16, %r13 +ja poly1305_finish_ext_avx2_24 +poly1305_finish_ext_avx2_23: +orq $256, %rdx +movq %rdx, 176(%r12) +jmp poly1305_finish_ext_avx2_28 +poly1305_finish_ext_avx2_24: +cmpq $32, %r13 +ja poly1305_finish_ext_avx2_27 +poly1305_finish_ext_avx2_25: +orq $128, %rdx +movq %rdx, 176(%r12) +jmp poly1305_finish_ext_avx2_28 +poly1305_finish_ext_avx2_27: +movq %rdx, 176(%r12) +poly1305_finish_ext_avx2_28: +movq %r12, %rdi +lea (%rsp), %rsi +movl $64, %edx +vzeroupper +call poly1305_blocks_avx2_local +poly1305_finish_ext_avx2_29: +movq 176(%r12), %rdx +testq $1, %rdx +je poly1305_finish_ext_avx2_37 +poly1305_finish_ext_avx2_30: +andq $-8125, %rdx +testq %r13, %r13 +je poly1305_finish_ext_avx2_32 +poly1305_finish_ext_avx2_31: +cmpq $48, %r13 +jbe poly1305_finish_ext_avx2_33 +poly1305_finish_ext_avx2_32: +orq $512, %rdx +jmp poly1305_finish_ext_avx2_36 +poly1305_finish_ext_avx2_33: +cmpq $32, %r13 +jbe poly1305_finish_ext_avx2_35 +poly1305_finish_ext_avx2_34: +orq $1024, %rdx +jmp poly1305_finish_ext_avx2_36 +poly1305_finish_ext_avx2_35: +movq %rdx, %rax +orq $4096, %rdx +orq $2048, %rax +cmpq $16, %r13 +cmova %rax, %rdx +poly1305_finish_ext_avx2_36: +orq $96, %rdx +movq %r12, %rdi +vpxor %ymm0, %ymm0, %ymm0 +lea (%rsp), %rsi +movq %rdx, 176(%r12) +movl $64, %edx +vmovdqu %ymm0, (%rsp) +vmovdqu %ymm0, 32(%rsp) +vzeroupper +call poly1305_blocks_avx2_local +poly1305_finish_ext_avx2_37: +movq 8(%r12), %r8 +movq %r8, %rsi +movq 16(%r12), %rax +vpxor %ymm0, %ymm0, %ymm0 +shlq $44, %rsi +shrq $20, %r8 +shlq $24, %rax +orq (%r12), %rsi +orq %rax, %r8 +movq 160(%r12), %rdx +movq 168(%r12), %rcx +addq %rdx, %rsi +adcq %rcx, %r8 +vmovdqu %ymm0, (%r12) +vmovdqu %ymm0, 32(%r12) +vmovdqu %ymm0, 64(%r12) +vmovdqu %ymm0, 96(%r12) +vmovdqu %ymm0, 128(%r12) +vmovdqu %ymm0, 160(%r12) +movq %rsi, (%r14) +movq %r8, 8(%r14) +vzeroupper +addq $104, %rsp +popq %r14 +popq %r13 +popq %r12 +movq %rbp, %rsp +popq %rbp +ret +FN_END poly1305_finish_ext_avx2 + +GLOBAL_HIDDEN_FN poly1305_blocks_avx2 +poly1305_blocks_avx2_local: +pushq %rbp +movq %rsp, %rbp +andq $-64, %rsp +subq $384, %rsp +movl $16777216, %eax +movl $67108863, %ecx +movl $5, %r8d +vmovd %eax, %xmm1 +vmovd %ecx, %xmm10 +vmovd %r8d, %xmm0 +movq 176(%rdi), %rax +vpbroadcastq %xmm1, %ymm1 +vpbroadcastq %xmm10, %ymm10 +vpbroadcastq %xmm0, %ymm11 +testq $60, %rax +je poly1305_blocks_avx2_11 +poly1305_blocks_avx2_2: +vpsrldq $8, %ymm1, %ymm15 +testq $4, %rax +je poly1305_blocks_avx2_4 +poly1305_blocks_avx2_3: +vpermq $192, %ymm15, %ymm15 +poly1305_blocks_avx2_4: +testq $8, %rax +je poly1305_blocks_avx2_6 +poly1305_blocks_avx2_5: +vpermq $240, %ymm15, %ymm15 +poly1305_blocks_avx2_6: +testq $16, %rax +je poly1305_blocks_avx2_8 +poly1305_blocks_avx2_7: +vpermq $252, %ymm15, %ymm15 +poly1305_blocks_avx2_8: +testq $32, %rax +je poly1305_blocks_avx2_10 +poly1305_blocks_avx2_9: +vpxor %ymm15, %ymm15, %ymm15 +poly1305_blocks_avx2_10: +vmovdqa %ymm15, %ymm1 +poly1305_blocks_avx2_11: +movq %rax, %rcx +btsq $0, %rcx +jc poly1305_blocks_avx2_13 +poly1305_blocks_avx2_12: +vmovdqu (%rsi), %ymm3 +movq %rcx, %rax +vmovdqu 32(%rsi), %ymm5 +vpunpcklqdq %ymm5, %ymm3, %ymm4 +addq $64, %rsi +vpunpckhqdq %ymm5, %ymm3, %ymm7 +vpermq $216, %ymm4, %ymm6 +addq $-64, %rdx +vpermq $216, %ymm7, %ymm0 +vpsrlq $52, %ymm6, %ymm8 +vpsllq $12, %ymm0, %ymm9 +vpsrlq $26, %ymm6, %ymm2 +vpsrlq $40, %ymm0, %ymm0 +vpand %ymm6, %ymm10, %ymm4 +vpor %ymm9, %ymm8, 
%ymm7 +vpand %ymm2, %ymm10, %ymm3 +vpor %ymm1, %ymm0, %ymm9 +vpsrlq $26, %ymm7, %ymm2 +vpand %ymm7, %ymm10, %ymm5 +vpand %ymm2, %ymm10, %ymm7 +movq %rax, 176(%rdi) +jmp poly1305_blocks_avx2_14 +poly1305_blocks_avx2_13: +vpermq $216, (%rdi), %ymm15 +vpxor %ymm0, %ymm0, %ymm0 +vpermq $216, 32(%rdi), %ymm14 +vpermq $216, 64(%rdi), %ymm13 +vpunpckldq %ymm0, %ymm15, %ymm4 +vpunpckhdq %ymm0, %ymm15, %ymm3 +vpunpckldq %ymm0, %ymm14, %ymm5 +vpunpckhdq %ymm0, %ymm14, %ymm7 +vpunpckldq %ymm0, %ymm13, %ymm9 +poly1305_blocks_avx2_14: +cmpq $64, %rdx +jb poly1305_blocks_avx2_34 +poly1305_blocks_avx2_15: +vmovdqu 140(%rdi), %ymm0 +testq $8064, %rax +je poly1305_blocks_avx2_29 +poly1305_blocks_avx2_16: +vpermq $216, 80(%rdi), %ymm6 +vpermq $216, 100(%rdi), %ymm2 +vpermq $216, 120(%rdi), %ymm8 +vpermq $216, %ymm0, %ymm0 +testq $128, %rax +je poly1305_blocks_avx2_18 +poly1305_blocks_avx2_17: +vmovdqa %ymm0, %ymm15 +vmovdqa %ymm0, %ymm14 +vmovdqa %ymm0, %ymm13 +vmovdqa %ymm8, %ymm12 +jmp poly1305_blocks_avx2_28 +poly1305_blocks_avx2_18: +testq $256, %rax +je poly1305_blocks_avx2_20 +poly1305_blocks_avx2_19: +vmovdqa %ymm0, %ymm15 +vmovdqa %ymm0, %ymm14 +vmovdqa %ymm8, %ymm13 +vmovdqa %ymm2, %ymm12 +jmp poly1305_blocks_avx2_28 +poly1305_blocks_avx2_20: +testq $512, %rax +je poly1305_blocks_avx2_22 +poly1305_blocks_avx2_21: +vmovdqa %ymm0, %ymm15 +vmovdqa %ymm8, %ymm14 +vmovdqa %ymm2, %ymm13 +vmovdqa %ymm6, %ymm12 +jmp poly1305_blocks_avx2_28 +poly1305_blocks_avx2_22: +testq $1024, %rax +je poly1305_blocks_avx2_24 +poly1305_blocks_avx2_23: +vpxor %ymm12, %ymm12, %ymm12 +movl $1, %r8d +vmovdqa %ymm8, %ymm15 +vmovdqa %ymm2, %ymm14 +vmovdqa %ymm6, %ymm13 +vmovd %r8d, %xmm12 +jmp poly1305_blocks_avx2_28 +poly1305_blocks_avx2_24: +testq $2048, %rax +je poly1305_blocks_avx2_26 +poly1305_blocks_avx2_25: +vpxor %ymm12, %ymm12, %ymm12 +movl $1, %r8d +vmovd %r8d, %xmm13 +vmovdqa %ymm2, %ymm15 +vmovdqa %ymm6, %ymm14 +vmovdqa %ymm13, %ymm12 +jmp poly1305_blocks_avx2_28 +poly1305_blocks_avx2_26: +testq $4096, %rax +je poly1305_blocks_avx2_28 +poly1305_blocks_avx2_27: +movl $1, %r8d +vmovd %r8d, %xmm14 +vmovdqa %ymm6, %ymm15 +vmovdqa %ymm14, %ymm13 +vmovdqa %ymm14, %ymm12 +poly1305_blocks_avx2_28: +vpunpcklqdq %ymm14, %ymm15, %ymm6 +vpunpcklqdq %ymm12, %ymm13, %ymm8 +vpunpckhqdq %ymm14, %ymm15, %ymm14 +vpunpckhqdq %ymm12, %ymm13, %ymm12 +vperm2i128 $32, %ymm8, %ymm6, %ymm2 +vperm2i128 $49, %ymm8, %ymm6, %ymm6 +vpsrlq $32, %ymm6, %ymm0 +vpsrlq $32, %ymm2, %ymm8 +vmovdqu %ymm0, 352(%rsp) +vperm2i128 $32, %ymm12, %ymm14, %ymm13 +vmovdqu %ymm13, 320(%rsp) +jmp poly1305_blocks_avx2_30 +poly1305_blocks_avx2_29: +vpsrlq $32, %ymm0, %ymm12 +vpermq $0, %ymm0, %ymm2 +vpermq $85, %ymm0, %ymm6 +vpermq $85, %ymm12, %ymm13 +vpermq $170, %ymm0, %ymm0 +vpermq $0, %ymm12, %ymm8 +vmovdqu %ymm13, 352(%rsp) +vmovdqu %ymm0, 320(%rsp) +poly1305_blocks_avx2_30: +vmovdqu (%rsi), %ymm12 +movq %rdx, %r9 +vmovdqu 352(%rsp), %ymm15 +vmovdqu %ymm1, 160(%rsp) +vmovdqu %ymm10, 192(%rsp) +vmovdqu %ymm11, 128(%rsp) +vperm2i128 $32, 32(%rsi), %ymm12, %ymm13 +xorl %r8d, %r8d +vperm2i128 $49, 32(%rsi), %ymm12, %ymm12 +xorl %ecx, %ecx +vpmuludq %ymm11, %ymm8, %ymm0 +vpmuludq %ymm11, %ymm6, %ymm1 +vmovdqu %ymm0, 224(%rsp) +vmovdqu %ymm1, 256(%rsp) +vpunpckldq %ymm12, %ymm13, %ymm14 +vpunpckhdq %ymm12, %ymm13, %ymm12 +vmovdqu %ymm14, 32(%rsp) +vpmuludq %ymm0, %ymm9, %ymm0 +vpmuludq %ymm1, %ymm7, %ymm13 +vpaddq %ymm13, %ymm0, %ymm0 +vpmuludq %ymm11, %ymm15, %ymm10 +vpmuludq %ymm10, %ymm5, %ymm13 +vpaddq %ymm13, %ymm0, %ymm0 +vmovdqu %ymm10, 288(%rsp) +vpmuludq 
320(%rsp), %ymm11, %ymm11 +vpmuludq %ymm11, %ymm3, %ymm13 +vpaddq %ymm13, %ymm0, %ymm0 +vmovdqu %ymm11, (%rsp) +vpmuludq %ymm2, %ymm4, %ymm13 +vpaddq %ymm13, %ymm0, %ymm0 +vpxor %ymm13, %ymm13, %ymm13 +vpunpckldq %ymm13, %ymm14, %ymm14 +vpaddq %ymm14, %ymm0, %ymm0 +vmovdqu %ymm0, 64(%rsp) +vpmuludq %ymm11, %ymm9, %ymm14 +vpmuludq %ymm2, %ymm7, %ymm0 +vpaddq %ymm0, %ymm14, %ymm14 +vpmuludq %ymm8, %ymm5, %ymm0 +vpaddq %ymm0, %ymm14, %ymm14 +vpmuludq %ymm6, %ymm3, %ymm0 +vpaddq %ymm0, %ymm14, %ymm14 +vpmuludq %ymm15, %ymm4, %ymm0 +vpaddq %ymm0, %ymm14, %ymm0 +vpunpckhdq %ymm13, %ymm12, %ymm14 +vpsllq $18, %ymm14, %ymm14 +vpaddq %ymm14, %ymm0, %ymm14 +vpmuludq %ymm1, %ymm9, %ymm1 +vpmuludq %ymm10, %ymm7, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vpmuludq %ymm11, %ymm5, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vpmuludq %ymm2, %ymm3, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vpmuludq %ymm8, %ymm4, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vmovdqu 32(%rsp), %ymm0 +vpunpckhdq %ymm13, %ymm0, %ymm0 +vpsllq $6, %ymm0, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vmovdqu 64(%rsp), %ymm0 +vpsrlq $26, %ymm0, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vmovdqu %ymm1, 96(%rsp) +vpmuludq %ymm2, %ymm9, %ymm1 +vpmuludq %ymm8, %ymm7, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vpmuludq %ymm10, %ymm9, %ymm10 +vpmuludq %ymm11, %ymm7, %ymm11 +vpaddq %ymm11, %ymm10, %ymm7 +vpmuludq %ymm6, %ymm5, %ymm0 +vpaddq %ymm0, %ymm1, %ymm1 +vpmuludq %ymm2, %ymm5, %ymm5 +vpaddq %ymm5, %ymm7, %ymm10 +vpmuludq %ymm15, %ymm3, %ymm15 +vpaddq %ymm15, %ymm1, %ymm1 +vpmuludq %ymm8, %ymm3, %ymm11 +vpaddq %ymm11, %ymm10, %ymm5 +vpunpckldq %ymm13, %ymm12, %ymm10 +vmovdqu 96(%rsp), %ymm12 +vpmuludq 320(%rsp), %ymm4, %ymm0 +vpaddq %ymm0, %ymm1, %ymm15 +vpsrlq $26, %ymm12, %ymm3 +vmovdqu 160(%rsp), %ymm1 +vpmuludq %ymm6, %ymm4, %ymm4 +vpaddq %ymm1, %ymm15, %ymm0 +vpsrlq $26, %ymm14, %ymm15 +vpaddq %ymm4, %ymm5, %ymm11 +vpsllq $12, %ymm10, %ymm4 +vmovdqu 192(%rsp), %ymm10 +vpaddq %ymm15, %ymm0, %ymm0 +vpaddq %ymm4, %ymm11, %ymm5 +vmovdqu 128(%rsp), %ymm11 +vpsrlq $26, %ymm0, %ymm9 +vpaddq %ymm3, %ymm5, %ymm7 +vpand 64(%rsp), %ymm10, %ymm13 +vpand %ymm10, %ymm12, %ymm12 +vpand %ymm10, %ymm7, %ymm5 +vpsrlq $26, %ymm7, %ymm7 +vpmuludq %ymm11, %ymm9, %ymm15 +vpand %ymm10, %ymm14, %ymm9 +vpaddq %ymm15, %ymm13, %ymm3 +vpand %ymm10, %ymm0, %ymm14 +vpaddq %ymm7, %ymm9, %ymm9 +vpand %ymm10, %ymm3, %ymm4 +vpsrlq $26, %ymm3, %ymm3 +vpsrlq $26, %ymm9, %ymm0 +vpand %ymm10, %ymm9, %ymm7 +vpaddq %ymm3, %ymm12, %ymm3 +vpaddq %ymm0, %ymm14, %ymm9 +sarq $5, %r9 +shrq $58, %r9 +addq %rdx, %r9 +sarq $6, %r9 +cmpq $2, %r9 +jl poly1305_blocks_avx2_34 +poly1305_blocks_avx2_31: +vmovdqu %ymm6, 32(%rsp) +lea -64(%rdx), %r9 +vmovdqu %ymm8, 64(%rsp) +vmovdqu %ymm11, 128(%rsp) +vmovdqu %ymm10, 192(%rsp) +vmovdqu %ymm1, 160(%rsp) +vmovdqu (%rsp), %ymm12 +sarq $5, %r9 +shrq $58, %r9 +lea -64(%rdx,%r9), %rdx +sarq $6, %rdx +poly1305_blocks_avx2_32: +vmovdqu 256(%rsp), %ymm15 +incq %r8 +vmovdqu 64(%rcx,%rsi), %ymm11 +vpmuludq 224(%rsp), %ymm9, %ymm8 +vpmuludq %ymm15, %ymm7, %ymm14 +vpaddq %ymm14, %ymm8, %ymm1 +vmovdqu 288(%rsp), %ymm8 +vperm2i128 $32, 96(%rcx,%rsi), %ymm11, %ymm10 +vperm2i128 $49, 96(%rcx,%rsi), %ymm11, %ymm6 +addq $64, %rcx +vpmuludq %ymm8, %ymm5, %ymm13 +vpunpckldq %ymm6, %ymm10, %ymm0 +vpunpckhdq %ymm6, %ymm10, %ymm11 +vpaddq %ymm13, %ymm1, %ymm10 +vpmuludq %ymm12, %ymm3, %ymm6 +vpaddq %ymm6, %ymm10, %ymm14 +vpxor %ymm10, %ymm10, %ymm10 +vpunpckldq %ymm10, %ymm0, %ymm6 +vpunpckhdq %ymm10, %ymm0, %ymm0 +vpmuludq %ymm2, %ymm4, %ymm1 +vpaddq %ymm1, %ymm14, %ymm13 +vpaddq %ymm6, %ymm13, %ymm1 +vmovdqu 
64(%rsp), %ymm6 +vmovdqu %ymm1, (%rsp) +vpsrlq $26, %ymm1, %ymm1 +vpmuludq %ymm12, %ymm9, %ymm14 +vpmuludq %ymm2, %ymm7, %ymm13 +vpaddq %ymm13, %ymm14, %ymm14 +vpmuludq %ymm6, %ymm5, %ymm13 +vpaddq %ymm13, %ymm14, %ymm14 +vpmuludq 32(%rsp), %ymm3, %ymm13 +vpaddq %ymm13, %ymm14, %ymm14 +vpmuludq 352(%rsp), %ymm4, %ymm13 +vpaddq %ymm13, %ymm14, %ymm13 +vpunpckhdq %ymm10, %ymm11, %ymm14 +vpsllq $18, %ymm14, %ymm14 +vpaddq %ymm14, %ymm13, %ymm13 +vpmuludq %ymm15, %ymm9, %ymm15 +vpmuludq %ymm8, %ymm7, %ymm14 +vpaddq %ymm14, %ymm15, %ymm15 +vpmuludq %ymm12, %ymm5, %ymm14 +vpaddq %ymm14, %ymm15, %ymm15 +vpmuludq %ymm2, %ymm3, %ymm14 +vpaddq %ymm14, %ymm15, %ymm15 +vpmuludq %ymm6, %ymm4, %ymm14 +vpaddq %ymm14, %ymm15, %ymm14 +vpsllq $6, %ymm0, %ymm15 +vpaddq %ymm15, %ymm14, %ymm14 +vmovdqu 32(%rsp), %ymm15 +vpaddq %ymm1, %ymm14, %ymm1 +vpmuludq %ymm2, %ymm9, %ymm0 +vpmuludq %ymm6, %ymm7, %ymm14 +vpmuludq %ymm8, %ymm9, %ymm9 +vpmuludq %ymm12, %ymm7, %ymm7 +vpaddq %ymm7, %ymm9, %ymm7 +vpaddq %ymm14, %ymm0, %ymm0 +vpsrlq $26, %ymm1, %ymm9 +vpmuludq %ymm15, %ymm5, %ymm14 +vpmuludq %ymm2, %ymm5, %ymm5 +vpaddq %ymm5, %ymm7, %ymm5 +vpaddq %ymm14, %ymm0, %ymm0 +vpmuludq 352(%rsp), %ymm3, %ymm14 +vpmuludq %ymm6, %ymm3, %ymm3 +vpaddq %ymm3, %ymm5, %ymm5 +vpaddq %ymm14, %ymm0, %ymm0 +vpmuludq 320(%rsp), %ymm4, %ymm14 +vpmuludq %ymm15, %ymm4, %ymm4 +vpaddq %ymm4, %ymm5, %ymm5 +vpaddq %ymm14, %ymm0, %ymm0 +vpunpckldq %ymm10, %ymm11, %ymm4 +vpaddq 160(%rsp), %ymm0, %ymm14 +vpsrlq $26, %ymm13, %ymm0 +vpsllq $12, %ymm4, %ymm3 +vpaddq %ymm0, %ymm14, %ymm14 +vpaddq %ymm3, %ymm5, %ymm7 +vpsrlq $26, %ymm14, %ymm0 +vpaddq %ymm9, %ymm7, %ymm10 +vmovdqu 192(%rsp), %ymm9 +vpsrlq $26, %ymm10, %ymm11 +vpand (%rsp), %ymm9, %ymm6 +vpand %ymm9, %ymm13, %ymm13 +vpand %ymm9, %ymm1, %ymm1 +vpand %ymm9, %ymm14, %ymm14 +vpand %ymm9, %ymm10, %ymm5 +vpmuludq 128(%rsp), %ymm0, %ymm8 +vpaddq %ymm8, %ymm6, %ymm15 +vpaddq %ymm11, %ymm13, %ymm0 +vpsrlq $26, %ymm15, %ymm3 +vpand %ymm9, %ymm0, %ymm7 +vpsrlq $26, %ymm0, %ymm0 +vpand %ymm9, %ymm15, %ymm4 +vpaddq %ymm3, %ymm1, %ymm3 +vpaddq %ymm0, %ymm14, %ymm9 +cmpq %rdx, %r8 +jb poly1305_blocks_avx2_32 +poly1305_blocks_avx2_34: +testq $64, %rax +jne poly1305_blocks_avx2_36 +poly1305_blocks_avx2_35: +vpshufd $8, %ymm4, %ymm0 +vpshufd $8, %ymm3, %ymm3 +vpshufd $8, %ymm5, %ymm5 +vpshufd $8, %ymm7, %ymm7 +vpshufd $8, %ymm9, %ymm9 +vpermq $8, %ymm0, %ymm1 +vpermq $8, %ymm3, %ymm2 +vpermq $8, %ymm5, %ymm4 +vpermq $8, %ymm7, %ymm6 +vpermq $8, %ymm9, %ymm11 +vperm2i128 $32, %ymm2, %ymm1, %ymm8 +vperm2i128 $32, %ymm6, %ymm4, %ymm10 +vmovdqu %ymm8, (%rdi) +vmovdqu %ymm10, 32(%rdi) +vmovdqu %xmm11, 64(%rdi) +jmp poly1305_blocks_avx2_37 +poly1305_blocks_avx2_36: +vpermq $245, %ymm4, %ymm0 +vpaddq %ymm0, %ymm4, %ymm4 +vpermq $245, %ymm3, %ymm1 +vpaddq %ymm1, %ymm3, %ymm10 +vpermq $245, %ymm5, %ymm3 +vpermq $170, %ymm4, %ymm6 +vpaddq %ymm3, %ymm5, %ymm13 +vpaddq %ymm6, %ymm4, %ymm8 +vpermq $170, %ymm10, %ymm11 +vpermq $245, %ymm7, %ymm5 +vpaddq %ymm11, %ymm10, %ymm12 +vpaddq %ymm5, %ymm7, %ymm7 +vpermq $170, %ymm13, %ymm14 +vpermq $245, %ymm9, %ymm2 +vpaddq %ymm14, %ymm13, %ymm15 +vpaddq %ymm2, %ymm9, %ymm9 +vpermq $170, %ymm7, %ymm0 +vpaddq %ymm0, %ymm7, %ymm1 +vpermq $170, %ymm9, %ymm2 +vpaddq %ymm2, %ymm9, %ymm3 +vmovd %xmm8, %r9d +movl %r9d, %r8d +shrl $26, %r8d +andq $67108863, %r9 +vmovd %xmm12, %esi +addl %r8d, %esi +movl %esi, %r11d +shrl $26, %esi +andq $67108863, %r11 +vmovd %xmm15, %ecx +addl %esi, %ecx +movl %ecx, %eax +shrl $26, %eax +andq $67108863, %rcx +shlq $8, %rcx +vmovd %xmm1, %r8d 
+addl %eax, %r8d +movl %r8d, %r10d +shrl $26, %r8d +andq $67108863, %r10 +movq %r10, %rax +shrq $10, %rax +shlq $34, %r10 +vmovd %xmm3, %edx +addl %r8d, %edx +shlq $16, %rdx +orq %rdx, %rax +movq %rax, %r8 +shrq $42, %r8 +lea (%r8,%r8,4), %rdx +movq %r11, %r8 +shlq $26, %r8 +orq %r8, %r9 +movq $0xfffffffffff, %r8 +shrq $18, %r11 +andq %r8, %r9 +addq %r9, %rdx +orq %rcx, %r11 +movq %rdx, %rsi +orq %r10, %r11 +shrq $44, %rsi +andq %r8, %r11 +addq %r11, %rsi +movq $0x3ffffffffff, %r9 +movq %rsi, %r10 +andq %r9, %rax +shrq $44, %r10 +andq %r8, %rdx +addq %r10, %rax +movq %r8, %rcx +andq %rax, %r9 +andq %r8, %rsi +shrq $42, %rax +movq $0xfffffc0000000000, %r10 +lea (%rax,%rax,4), %r11 +addq %r11, %rdx +andq %rdx, %rcx +shrq $44, %rdx +addq %rdx, %rsi +lea 5(%rcx), %rdx +movq %rdx, %r11 +andq %r8, %rdx +shrq $44, %r11 +addq %rsi, %r11 +movq %r11, %rax +andq %r11, %r8 +shrq $44, %rax +addq %r9, %rax +addq %r10, %rax +movq %rax, %r10 +shrq $63, %r10 +decq %r10 +andn %rcx, %r10, %rcx +andq %r10, %rdx +orq %rdx, %rcx +andq %r10, %r8 +andn %rsi, %r10, %rdx +andq %r10, %rax +andn %r9, %r10, %rsi +orq %r8, %rdx +orq %rax, %rsi +movq %rcx, (%rdi) +movq %rdx, 8(%rdi) +movq %rsi, 16(%rdi) +poly1305_blocks_avx2_37: +vzeroupper +movq %rbp, %rsp +popq %rbp +ret +FN_END poly1305_blocks_avx2 + +GLOBAL_HIDDEN_FN poly1305_init_ext_avx2 +poly1305_init_ext_avx2_local: +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +pushq %rbx +movq %rdi, %r10 +vpxor %ymm0, %ymm0, %ymm0 +movq %rdx, %r12 +vpxor %xmm1, %xmm1, %xmm1 +vmovdqu %xmm1, 64(%r10) +vmovdqu %ymm0, (%r10) +vmovdqu %ymm0, 32(%r10) +movq $-1, %r8 +testq %r12, %r12 +movq 8(%rsi), %rdi +movq $0xffc0fffffff, %r9 +movq %rdi, %rcx +cmove %r8, %r12 +movq (%rsi), %r8 +andq %r8, %r9 +shrq $44, %r8 +movq $0xfffffc0ffff, %r11 +shlq $20, %rcx +shrq $24, %rdi +orq %rcx, %r8 +movq $0xffffffc0f, %rcx +andq %r11, %r8 +andq %rcx, %rdi +movq 16(%rsi), %rcx +movq %rcx, 160(%r10) +movq %r9, %rcx +movq 24(%rsi), %rdx +movq %rdx, 168(%r10) +movl %r9d, %edx +andl $67108863, %edx +movl %edx, 80(%r10) +movq %r8, %rdx +shrq $26, %rcx +shlq $18, %rdx +orq %rdx, %rcx +movq %r8, %rdx +shrq $8, %rdx +andl $67108863, %ecx +andl $67108863, %edx +movl %ecx, 84(%r10) +movq %r8, %rcx +movl %edx, 88(%r10) +movq %rdi, %rdx +shrq $34, %rcx +shlq $10, %rdx +orq %rdx, %rcx +movq %rdi, %rdx +shrq $16, %rdx +andl $67108863, %ecx +movl %ecx, 92(%r10) +movl %edx, 96(%r10) +cmpq $16, %r12 +jbe poly1305_init_ext_avx2_7 +poly1305_init_ext_avx2_2: +movq %r9, %rax +lea (%rdi,%rdi,4), %r14 +mulq %r9 +shlq $2, %r14 +movq %rax, %r11 +movq %rdx, %r15 +lea (%r8,%r8), %rax +mulq %r14 +addq %rax, %r11 +lea (%r9,%r9), %rax +movq %r11, %rsi +adcq %rdx, %r15 +mulq %r8 +movq %rax, %rbx +movq %r14, %rax +movq %rdx, %rcx +lea (%rdi,%rdi), %r14 +mulq %rdi +addq %rax, %rbx +movq %r8, %rax +adcq %rdx, %rcx +mulq %r8 +shlq $20, %r15 +movq %rax, %r13 +shrq $44, %rsi +movq %r9, %rax +orq %rsi, %r15 +movq %rdx, %rsi +mulq %r14 +addq %r15, %rbx +movq %rbx, %r15 +adcq $0, %rcx +addq %rax, %r13 +adcq %rdx, %rsi +shlq $20, %rcx +shrq $44, %r15 +orq %r15, %rcx +addq %rcx, %r13 +movq $0xfffffffffff, %rcx +movq %r13, %rdx +adcq $0, %rsi +andq %rcx, %r11 +shlq $22, %rsi +andq %rcx, %rbx +shrq $42, %rdx +orq %rdx, %rsi +lea (%rsi,%rsi,4), %rsi +addq %rsi, %r11 +movq %rcx, %rsi +andq %r11, %rsi +shrq $44, %r11 +addq %r11, %rbx +movq $0x3ffffffffff, %r11 +andq %rbx, %rcx +andq %r11, %r13 +shrq $44, %rbx +movq %rsi, %r11 +movq %rcx, %rdx +addq %r13, %rbx +shrq $26, %r11 +movq %rbx, %r15 +shlq $18, %rdx +movq %rcx, %r14 +orq %rdx, %r11 
+movq %rcx, %rdx +shrq $34, %rdx +movl %esi, %r13d +shlq $10, %r15 +andl $67108863, %r13d +orq %r15, %rdx +andl $67108863, %r11d +shrq $8, %r14 +andl $67108863, %edx +movl %edx, 112(%r10) +movq %rbx, %rdx +shrq $16, %rdx +andl $67108863, %r14d +movl %r13d, 100(%r10) +movl %r11d, 104(%r10) +movl %r14d, 108(%r10) +movl %edx, 116(%r10) +cmpq $48, %r12 +jbe poly1305_init_ext_avx2_4 +poly1305_init_ext_avx2_3: +movq %rsi, %rax +lea (%rbx,%rbx,4), %r15 +mulq %rsi +shlq $2, %r15 +movq %rax, %r13 +movq %rdx, %r12 +lea (%rcx,%rcx), %rax +mulq %r15 +addq %rax, %r13 +lea (%rsi,%rsi), %rax +movq %r15, -16(%rsp) +adcq %rdx, %r12 +mulq %rcx +movq %rax, %r14 +movq %rbx, %rax +movq %rdx, %r11 +mulq %r15 +addq %rax, %r14 +movq %rcx, %rax +movq %r13, %r15 +adcq %rdx, %r11 +mulq %rcx +shlq $20, %r12 +shrq $44, %r15 +orq %r15, %r12 +movq %rax, %r15 +addq %r12, %r14 +movq %rdx, %r12 +movq %rsi, %rax +lea (%rbx,%rbx), %rdx +adcq $0, %r11 +mulq %rdx +addq %rax, %r15 +adcq %rdx, %r12 +movq %r14, %rdx +shlq $20, %r11 +shrq $44, %rdx +orq %rdx, %r11 +addq %r11, %r15 +movq $0xfffffffffff, %r11 +movq %r15, %rdx +adcq $0, %r12 +andq %r11, %r13 +shlq $22, %r12 +andq %r11, %r14 +shrq $42, %rdx +orq %rdx, %r12 +lea (%r12,%r12,4), %r12 +addq %r12, %r13 +movq %r11, %r12 +andq %r13, %r12 +shrq $44, %r13 +addq %r13, %r14 +movq $0x3ffffffffff, %r13 +andq %r14, %r11 +andq %r13, %r15 +shrq $44, %r14 +movq %r11, %rdx +shlq $18, %rdx +addq %r14, %r15 +movl %r12d, %r14d +movq %r11, %r13 +shrq $26, %r12 +andl $67108863, %r14d +orq %rdx, %r12 +movq %r15, %rdx +shrq $34, %r11 +shlq $10, %rdx +andl $67108863, %r12d +orq %rdx, %r11 +shrq $8, %r13 +andl $67108863, %r11d +movl %r11d, 152(%r10) +andl $67108863, %r13d +shrq $16, %r15 +movl %r14d, 140(%r10) +movl %r12d, 144(%r10) +movl %r13d, 148(%r10) +movl %r15d, 156(%r10) +movq -16(%rsp), %r11 +jmp poly1305_init_ext_avx2_6 +poly1305_init_ext_avx2_4: +cmpq $32, %r12 +jbe poly1305_init_ext_avx2_7 +poly1305_init_ext_avx2_5: +lea (%rbx,%rbx,4), %r11 +shlq $2, %r11 +poly1305_init_ext_avx2_6: +movq %r9, %rax +lea (%rcx,%rcx,4), %r13 +mulq %rsi +shlq $2, %r13 +movq %rax, %r14 +movq %rdi, %rax +movq %rdx, %r12 +mulq %r13 +addq %rax, %r14 +movq %r8, %rax +adcq %rdx, %r12 +mulq %r11 +addq %rax, %r14 +movq %r8, %rax +adcq %rdx, %r12 +mulq %rsi +movq %rax, %r15 +movq %r9, %rax +movq %rdx, %r13 +mulq %rcx +addq %rax, %r15 +movq %r11, %rax +movq %r14, %r11 +adcq %rdx, %r13 +mulq %rdi +addq %rax, %r15 +movq %rdi, %rax +adcq %rdx, %r13 +mulq %rsi +shlq $20, %r12 +movq %rax, %rsi +shrq $44, %r11 +movq %r8, %rax +orq %r11, %r12 +movq %rdx, %rdi +mulq %rcx +addq %r12, %r15 +movq %r15, %rcx +adcq $0, %r13 +addq %rax, %rsi +movq %r9, %rax +movq $0xfffffffffff, %r9 +adcq %rdx, %rdi +andq %r9, %r14 +mulq %rbx +addq %rax, %rsi +adcq %rdx, %rdi +movq %r9, %rdx +shlq $20, %r13 +andq %r9, %r15 +shrq $44, %rcx +orq %rcx, %r13 +addq %r13, %rsi +movq %rsi, %rbx +adcq $0, %rdi +shlq $22, %rdi +shrq $42, %rbx +orq %rbx, %rdi +lea (%rdi,%rdi,4), %r8 +addq %r8, %r14 +andq %r14, %rdx +shrq $44, %r14 +addq %r14, %r15 +movq $0x3ffffffffff, %r14 +andq %r15, %r9 +andq %r14, %rsi +shrq $44, %r15 +movq %r9, %rax +addq %r15, %rsi +movl %edx, %r15d +movq %rsi, %rbx +movq %r9, %rcx +shrq $26, %rdx +andl $67108863, %r15d +shlq $18, %rax +shrq $34, %r9 +orq %rax, %rdx +shlq $10, %rbx +shrq $8, %rcx +orq %rbx, %r9 +shrq $16, %rsi +andl $67108863, %edx +andl $67108863, %ecx +andl $67108863, %r9d +movl %r15d, 120(%r10) +movl %edx, 124(%r10) +movl %ecx, 128(%r10) +movl %r9d, 132(%r10) +movl %esi, 136(%r10) +poly1305_init_ext_avx2_7: 
+movq $0, 176(%r10) +vzeroupper +popq %rbx +popq %r15 +popq %r14 +popq %r13 +popq %r12 +ret +FN_END poly1305_init_ext_avx2 +
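Note on the interface (not part of the patch itself): the file exports four entry points, and poly1305_auth_avx2 shows how they compose. Inputs shorter than 128 bytes are routed to the existing x86 code (poly1305_auth_x86_local); otherwise a 64-byte-aligned state on the stack is initialised with the key and the total length, every whole 64-byte block is fed to poly1305_blocks_avx2, and poly1305_finish_ext_avx2 absorbs the tail and writes the 16-byte tag. The C sketch below mirrors that >=128-byte path; the prototypes, the state size and the struct name are inferred from the register usage above (rdi = state, rsi = key/input, rdx = length, rcx = mac) and are assumptions, not the declarations from rspamd's headers.

/* Hypothetical C-side view of the entry points added by this patch.
 * Prototypes and the state size are inferred from the assembly
 * (poly1305_auth_avx2 reserves 224 bytes of 64-byte-aligned stack and
 * the code touches state offsets up to 176), so treat this as a sketch. */
#include <stddef.h>
#include <string.h>

typedef struct { unsigned char opaque[224]; } poly1305_state_avx2
        __attribute__((aligned(64)));

size_t poly1305_block_size_avx2(void);                   /* returns 64 */
void poly1305_init_ext_avx2(poly1305_state_avx2 *st,
                            const unsigned char key[32], size_t total_len);
void poly1305_blocks_avx2(poly1305_state_avx2 *st,
                          const unsigned char *in, size_t len); /* len % 64 == 0 */
void poly1305_finish_ext_avx2(poly1305_state_avx2 *st,
                              const unsigned char *in, size_t remaining,
                              unsigned char mac[16]);

/* Mirrors what poly1305_auth_avx2 does for messages of at least 128 bytes:
 * init with key and total length, hash the largest multiple of the 64-byte
 * block size, then let finish_ext pad and absorb the tail and emit the tag. */
static void poly1305_auth_avx2_sketch(unsigned char mac[16],
                                      const unsigned char *in, size_t inlen,
                                      const unsigned char key[32])
{
    poly1305_state_avx2 st;
    size_t bulk = inlen & ~(size_t)63;      /* whole 64-byte blocks */

    poly1305_init_ext_avx2(&st, key, inlen);
    if (bulk > 0)
        poly1305_blocks_avx2(&st, in, bulk);
    poly1305_finish_ext_avx2(&st, in + bulk, inlen - bulk, mac);
    /* finish_ext already clears the state in the assembly; wiping the
     * stack copy again here is only extra caution in this sketch */
    memset(&st, 0, sizeof(st));
}

Judging from the constants loaded at the top of poly1305_blocks_avx2 (67108863 = 2^26-1 and 5), the accumulator is kept as 26-bit limbs with four 16-byte message blocks processed per iteration in the AVX2 lanes, using the usual carry*5 folding for reduction modulo 2^130-5, while poly1305_init_ext_avx2 and the final carry chain work in a 44/44/42-bit representation (the 0xfffffffffff and 0x3ffffffffff masks). This is an observation about the code above, not documentation taken from the patch.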