author     Vsevolod Stakhov <vsevolod@highsecure.ru>    2015-10-25 22:42:28 +0000
committer  Vsevolod Stakhov <vsevolod@highsecure.ru>    2015-10-25 22:42:28 +0000
commit     c0a9cb5b58a7e6b9308fedc38a88a5c749700a10 (patch)
tree       28d39fb7a200e32518ef94d43bd2f8399c325c1a /src/libcryptobox
parent     ebc15f59da42f0b01afb74caf844b8d8005d79c7 (diff)
download   rspamd-c0a9cb5b58a7e6b9308fedc38a88a5c749700a10.tar.gz
           rspamd-c0a9cb5b58a7e6b9308fedc38a88a5c749700a10.zip
Add optimized version of blake2b from @floodyberry
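Note on the entry point the new backends implement: both x86-32.S and x86-64.S export blake2b_blocks_x86 (only one is built per architecture, per the CMakeLists.txt hunk), and avx.S exports blake2b_blocks_avx, all C-callable with the same interface as the reference implementation; the ref.c hunk below makes blake2b_blocks_ref non-static, giving it the same external linkage as the assembly backends. A hedged sketch of that shared prototype follows — the first three parameters match the blake2b_blocks_ref() signature visible in the diff, while the fourth parameter (the per-block input advance used by the assembly main loops) is not shown in the hunk, so its name "stride" is an assumption for illustration only.

#include <stddef.h>

/* Opaque state type; the real definition lives in the blake2 headers. */
typedef struct blake2b_state_internal blake2b_state_internal;

/* Shared block-compression prototype (fourth parameter name is assumed). */
void blake2b_blocks_ref (blake2b_state_internal *S, const unsigned char *in,
		size_t bytes, size_t stride);
void blake2b_blocks_x86 (blake2b_state_internal *S, const unsigned char *in,
		size_t bytes, size_t stride);
void blake2b_blocks_avx (blake2b_state_internal *S, const unsigned char *in,
		size_t bytes, size_t stride);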
Diffstat (limited to 'src/libcryptobox')
 src/libcryptobox/CMakeLists.txt     |    3
 src/libcryptobox/blake2/avx.S       |  689
 src/libcryptobox/blake2/blake2.c    |   13
 src/libcryptobox/blake2/constants.S |   30
 src/libcryptobox/blake2/ref.c       |    2
 src/libcryptobox/blake2/x86-32.S    | 1080
 src/libcryptobox/blake2/x86-64.S    | 1754
 7 files changed, 3558 insertions, 13 deletions
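The blake2.c hunk in the diff below removes the AVX2 entry and keeps a table of backends, blake2b_list, where each blake2b_impl_t entry is built by BLAKE2B_IMPL from the required CPU flags, a short name, and the blake2b_blocks_* function; blake2b_opt initially points at the first (generic) entry. The actual runtime selection code is not part of this diff, so the following is only a minimal sketch of that kind of CPU-flag dispatch, not the rspamd source: the field names, the cpu_config parameter, and the selection loop are assumptions for illustration.

#include <stddef.h>

typedef struct blake2b_state_internal blake2b_state_internal;

/* One table entry: required CPU flags, a short name, and the block function.
 * Mirrors the three values packed by BLAKE2B_IMPL in blake2.c; the field
 * names here are assumptions. */
typedef struct blake2b_impl_t {
	unsigned long cpu_flags;
	const char *desc;
	void (*blake2b_blocks) (blake2b_state_internal *state,
			const unsigned char *in, size_t bytes, size_t stride);
} blake2b_impl_t;

/* Pick the last table entry whose CPU requirements are all satisfied by the
 * detected cpu_config bits; entry 0 is expected to be the generic C
 * reference, so there is always a usable fallback. */
static const blake2b_impl_t *
blake2b_pick_impl (unsigned long cpu_config,
		const blake2b_impl_t *list, size_t nlist)
{
	const blake2b_impl_t *best = &list[0];
	size_t i;

	for (i = 1; i < nlist; i ++) {
		if ((list[i].cpu_flags & cpu_config) == list[i].cpu_flags) {
			best = &list[i];
		}
	}

	return best;
}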
diff --git a/src/libcryptobox/CMakeLists.txt b/src/libcryptobox/CMakeLists.txt index 9cb7ecbe7..171a4d0f9 100644 --- a/src/libcryptobox/CMakeLists.txt +++ b/src/libcryptobox/CMakeLists.txt @@ -53,9 +53,11 @@ IF("${ARCH}" STREQUAL "x86_64") SET(CURVESRC ${CURVESRC} ${CMAKE_CURRENT_SOURCE_DIR}/curve25519/curve25519-donna.c) endif() + SET(BLAKE2SRC ${BLAKE2SRC} ${CMAKE_CURRENT_SOURCE_DIR}/blake2/x86-64.S) ELSEIF("${ARCH}" STREQUAL "i386") SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/ref-32.c) SET(CURVESRC ${CURVESRC} ${CMAKE_CURRENT_SOURCE_DIR}/curve25519/curve25519-donna.c) + SET(BLAKE2SRC ${BLAKE2SRC} ${CMAKE_CURRENT_SOURCE_DIR}/blake2/x86-32.S) ELSE() SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/ref-32.c) ENDIF() @@ -69,6 +71,7 @@ IF(HAVE_AVX) SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/avx.S) SET(CURVESRC ${CURVESRC} ${CMAKE_CURRENT_SOURCE_DIR}/curve25519/avx.S ${CMAKE_CURRENT_SOURCE_DIR}/curve25519/avx.c) + SET(BLAKE2SRC ${BLAKE2SRC} ${CMAKE_CURRENT_SOURCE_DIR}/blake2/avx.S) ENDIF(HAVE_AVX) IF(HAVE_SSE2) SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/sse2.S) diff --git a/src/libcryptobox/blake2/avx.S b/src/libcryptobox/blake2/avx.S new file mode 100644 index 000000000..e569f0ba7 --- /dev/null +++ b/src/libcryptobox/blake2/avx.S @@ -0,0 +1,689 @@ +#include "../macro.S" +#include "constants.S" + +SECTION_TEXT + +GLOBAL_HIDDEN_FN_EXT blake2b_blocks_avx, 4, 16 +pushq %rbp +movq %rsp, %rbp +andq $-64, %rsp +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +pushq %rbx +subq $344, %rsp +LOAD_VAR_PIC 48+blake2b_constants, %r9 +LOAD_VAR_PIC blake2b_constants_ssse3, %rax +leaq 16(%rax), %r8 +vmovdqu 80(%rdi), %xmm0 +cmpq $128, %rdx +vpxor (%r9), %xmm0, %xmm0 +movl $128, %r9d +vmovdqu (%rax), %xmm12 +cmovbe %rdx, %r9 +vmovdqu (%r8), %xmm1 +movq 64(%rdi), %r8 +movq 72(%rdi), %rax +cmpq $0, 80(%rdi) +je blake2b_blocks_avx_L21 +blake2b_blocks_avx_L2: +cmpq $128, %rdx +je blake2b_blocks_avx_L21 +blake2b_blocks_avx_L3: +lea (%rsp), %r10 +testq $64, %rdx +je blake2b_blocks_avx_L5 +blake2b_blocks_avx_L4: +vmovdqu (%rsi), %xmm2 +vmovdqu %xmm2, (%rsp) +lea 64(%rsp), %r10 +vmovdqu 16(%rsi), %xmm3 +vmovdqu %xmm3, 16(%rsp) +vpxor %xmm2, %xmm2, %xmm2 +vmovdqu 32(%rsi), %xmm4 +vmovdqu %xmm4, 32(%rsp) +vmovdqu 48(%rsi), %xmm5 +vmovdqu %xmm5, 48(%rsp) +addq $64, %rsi +jmp blake2b_blocks_avx_L6 +blake2b_blocks_avx_L5: +vpxor %xmm2, %xmm2, %xmm2 +vmovdqu %xmm2, 64(%rsp) +vmovdqu %xmm2, 80(%rsp) +vmovdqu %xmm2, 96(%rsp) +vmovdqu %xmm2, 112(%rsp) +blake2b_blocks_avx_L6: +vmovdqu %xmm2, (%r10) +vmovdqu %xmm2, 16(%r10) +vmovdqu %xmm2, 32(%r10) +vmovdqu %xmm2, 48(%r10) +testq $32, %rdx +je blake2b_blocks_avx_L8 +blake2b_blocks_avx_L7: +vmovdqu (%rsi), %xmm2 +vmovdqu %xmm2, (%r10) +vmovdqu 16(%rsi), %xmm3 +vmovdqu %xmm3, 16(%r10) +addq $32, %rsi +addq $32, %r10 +blake2b_blocks_avx_L8: +testq $16, %rdx +je blake2b_blocks_avx_L10 +blake2b_blocks_avx_L9: +vmovdqu (%rsi), %xmm2 +vmovdqu %xmm2, (%r10) +addq $16, %rsi +addq $16, %r10 +blake2b_blocks_avx_L10: +testq $8, %rdx +je blake2b_blocks_avx_L12 +blake2b_blocks_avx_L11: +movq (%rsi), %r11 +addq $8, %rsi +movq %r11, (%r10) +addq $8, %r10 +blake2b_blocks_avx_L12: +testq $4, %rdx +je blake2b_blocks_avx_L14 +blake2b_blocks_avx_L13: +movl (%rsi), %r11d +addq $4, %rsi +movl %r11d, (%r10) +addq $4, %r10 +blake2b_blocks_avx_L14: +testq $2, %rdx +je blake2b_blocks_avx_L16 +blake2b_blocks_avx_L15: +movzwl (%rsi), %r11d +addq $2, %rsi +movw %r11w, (%r10) +addq $2, %r10 +blake2b_blocks_avx_L16: +testq $1, %rdx +je 
blake2b_blocks_avx_L18 +blake2b_blocks_avx_L17: +movb (%rsi), %sil +movb %sil, (%r10) +blake2b_blocks_avx_L18: +lea (%rsp), %rsi +blake2b_blocks_avx_L21: +LOAD_VAR_PIC 32+blake2b_constants, %r10 +LOAD_VAR_PIC blake2b_constants, %r11 +vmovdqu (%rdi), %xmm5 +vmovdqu 16(%rdi), %xmm6 +vmovdqu 32(%rdi), %xmm7 +vmovdqu (%r10), %xmm4 +LOAD_VAR_PIC 16+blake2b_constants, %r10 +vmovdqu 48(%rdi), %xmm8 +vmovdqu (%r11), %xmm3 +vmovdqu %xmm3, 176(%rsp) +vmovdqu (%r10), %xmm2 +vmovdqu %xmm2, 160(%rsp) +vmovdqu %xmm4, 144(%rsp) +vmovdqu %xmm8, 240(%rsp) +vmovdqu %xmm7, 256(%rsp) +vmovdqu %xmm6, 224(%rsp) +vmovdqu %xmm5, 208(%rsp) +vmovdqu %xmm0, 192(%rsp) +movq %r9, 272(%rsp) +movq %rdi, 128(%rsp) +movq %rcx, 136(%rsp) +jmp blake2b_blocks_avx_L22 +# align to 31 mod 64 +.p2align 6 +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +nop +blake2b_blocks_avx_L25: +addq 136(%rsp), %rsi +addq $-128, %rdx +blake2b_blocks_avx_L22: +movq 272(%rsp), %rcx +addq %rcx, %r8 +cmpq %rcx, %r8 +lea 1(%rax), %rbx +vmovdqu (%rsi), %xmm15 +vmovdqu 16(%rsi), %xmm5 +vmovdqu 32(%rsi), %xmm3 +vmovdqu 48(%rsi), %xmm6 +cmovb %rbx, %rax +vmovd %r8, %xmm7 +vpunpcklqdq %xmm5, %xmm15, %xmm2 +LOAD_VAR_PIC 96+blake2b_constants, %rcx +vpunpcklqdq %xmm6, %xmm3, %xmm8 +LOAD_VAR_PIC 224+blake2b_constants, %rbx +vpaddq 208(%rsp), %xmm2, %xmm0 +vpaddq 224(%rsp), %xmm8, %xmm10 +vmovd %rax, %xmm14 +vmovdqu 256(%rsp), %xmm4 +vmovdqu 240(%rsp), %xmm11 +vpunpcklqdq %xmm14, %xmm7, %xmm9 +vpaddq %xmm4, %xmm0, %xmm13 +vpaddq %xmm11, %xmm10, %xmm2 +vpxor 144(%rsp), %xmm9, %xmm0 +vpxor 192(%rsp), %xmm2, %xmm10 +vpxor %xmm13, %xmm0, %xmm8 +vpshufd $177, %xmm8, %xmm8 +vpshufd $177, %xmm10, %xmm7 +vpaddq 176(%rsp), %xmm8, %xmm14 +vpaddq 160(%rsp), %xmm7, %xmm9 +vpxor %xmm14, %xmm4, %xmm4 +vpxor %xmm9, %xmm11, %xmm11 +vpshufb %xmm1, %xmm4, %xmm4 +vpshufb %xmm1, %xmm11, %xmm0 +vpunpckhqdq %xmm5, %xmm15, %xmm15 +vpunpckhqdq %xmm6, %xmm3, %xmm6 +vpaddq %xmm15, %xmm13, %xmm13 +vpaddq %xmm6, %xmm2, %xmm6 +vpaddq %xmm4, %xmm13, %xmm10 +vpaddq %xmm0, %xmm6, %xmm15 +vpxor %xmm10, %xmm8, %xmm2 +vpxor %xmm15, %xmm7, %xmm8 +vpshufb %xmm12, %xmm2, %xmm5 +vpshufb %xmm12, %xmm8, %xmm2 +vpaddq %xmm5, %xmm14, %xmm6 +vpaddq %xmm2, %xmm9, %xmm7 +vpxor %xmm6, %xmm4, %xmm4 +vpxor %xmm7, %xmm0, %xmm9 +vpaddq %xmm4, %xmm4, %xmm14 +vpaddq %xmm9, %xmm9, %xmm13 +vpsrlq $63, %xmm4, %xmm0 +vpsrlq $63, %xmm9, %xmm11 +vpor %xmm14, %xmm0, %xmm8 +vpor %xmm13, %xmm11, %xmm4 +vpalignr $8, %xmm8, %xmm4, %xmm0 +vpalignr $8, %xmm4, %xmm8, %xmm14 +vmovdqu 64(%rsi), %xmm9 +vmovdqu 80(%rsi), %xmm8 +vmovdqu 96(%rsi), %xmm4 +vpunpcklqdq %xmm8, %xmm9, %xmm11 +vpaddq %xmm11, %xmm10, %xmm10 +vmovdqu 112(%rsi), %xmm11 +vpaddq %xmm0, %xmm10, %xmm13 +vpunpcklqdq %xmm11, %xmm4, %xmm10 +vpaddq %xmm10, %xmm15, %xmm15 +vpaddq %xmm14, %xmm15, %xmm15 +vpalignr $8, %xmm2, %xmm5, %xmm10 +vpalignr $8, %xmm5, %xmm2, %xmm5 +vpxor %xmm13, %xmm10, %xmm10 +vpxor %xmm15, %xmm5, %xmm2 +vpshufd $177, %xmm10, %xmm10 +vpshufd $177, %xmm2, %xmm2 +vpaddq %xmm10, %xmm7, %xmm7 +vpaddq %xmm2, %xmm6, %xmm5 +vpxor %xmm7, %xmm0, %xmm6 +vpxor %xmm5, %xmm14, %xmm14 +vpshufb %xmm1, %xmm6, %xmm0 +vpshufb %xmm1, %xmm14, %xmm6 +vpunpckhqdq %xmm8, %xmm9, %xmm14 +vpaddq %xmm14, %xmm13, %xmm13 +vpaddq %xmm0, %xmm13, %xmm14 +vpunpckhqdq %xmm11, %xmm4, %xmm13 +vpxor %xmm14, %xmm10, %xmm10 +vpaddq %xmm13, %xmm15, %xmm15 +vpshufb %xmm12, %xmm10, %xmm13 +vpaddq %xmm6, %xmm15, %xmm15 +vpaddq %xmm13, %xmm7, %xmm10 +vpxor %xmm15, %xmm2, %xmm2 
+vpxor %xmm10, %xmm0, %xmm0 +vpshufb %xmm12, %xmm2, %xmm2 +vpaddq %xmm2, %xmm5, %xmm5 +vpxor %xmm5, %xmm6, %xmm7 +vpsrlq $63, %xmm0, %xmm6 +vpaddq %xmm0, %xmm0, %xmm0 +vpor %xmm0, %xmm6, %xmm6 +vpsrlq $63, %xmm7, %xmm0 +vpaddq %xmm7, %xmm7, %xmm7 +vpor %xmm7, %xmm0, %xmm0 +vpalignr $8, %xmm0, %xmm6, %xmm7 +vpalignr $8, %xmm6, %xmm0, %xmm6 +vpunpcklqdq %xmm3, %xmm11, %xmm0 +vpaddq %xmm0, %xmm14, %xmm14 +vpaddq %xmm7, %xmm14, %xmm0 +vpunpckhqdq %xmm4, %xmm9, %xmm14 +vpaddq %xmm14, %xmm15, %xmm15 +vpaddq %xmm6, %xmm15, %xmm14 +vpalignr $8, %xmm13, %xmm2, %xmm15 +vpxor %xmm0, %xmm15, %xmm15 +vpshufd $177, %xmm15, %xmm15 +vpalignr $8, %xmm2, %xmm13, %xmm2 +vpxor %xmm14, %xmm2, %xmm13 +vpaddq %xmm15, %xmm5, %xmm2 +vpshufd $177, %xmm13, %xmm13 +vpxor %xmm2, %xmm7, %xmm5 +vpunpcklqdq %xmm9, %xmm8, %xmm7 +vpaddq %xmm13, %xmm10, %xmm10 +vpaddq %xmm7, %xmm0, %xmm9 +vmovdqu 48(%rsi), %xmm0 +vpshufb %xmm1, %xmm5, %xmm5 +vpxor %xmm10, %xmm6, %xmm6 +vpshufb %xmm1, %xmm6, %xmm6 +vpaddq %xmm5, %xmm9, %xmm9 +vpalignr $8, %xmm11, %xmm0, %xmm11 +vpxor %xmm9, %xmm15, %xmm15 +vpaddq %xmm11, %xmm14, %xmm7 +vpshufb %xmm12, %xmm15, %xmm11 +vpaddq %xmm6, %xmm7, %xmm14 +vpaddq %xmm11, %xmm2, %xmm2 +vpxor %xmm14, %xmm13, %xmm13 +vpxor %xmm2, %xmm5, %xmm5 +vpshufb %xmm12, %xmm13, %xmm13 +vpaddq %xmm13, %xmm10, %xmm10 +vpxor %xmm10, %xmm6, %xmm15 +vpsrlq $63, %xmm5, %xmm6 +vpaddq %xmm5, %xmm5, %xmm5 +vpsrlq $63, %xmm15, %xmm7 +vpor %xmm5, %xmm6, %xmm6 +vpaddq %xmm15, %xmm15, %xmm15 +vpor %xmm15, %xmm7, %xmm5 +vpalignr $8, %xmm6, %xmm5, %xmm15 +vpalignr $8, %xmm5, %xmm6, %xmm5 +vpshufd $78, (%rsi), %xmm6 +vpaddq %xmm6, %xmm9, %xmm9 +vpunpckhqdq %xmm3, %xmm8, %xmm3 +vpaddq %xmm3, %xmm14, %xmm6 +vpaddq %xmm15, %xmm9, %xmm9 +vpaddq %xmm5, %xmm6, %xmm8 +vpalignr $8, %xmm13, %xmm11, %xmm3 +vpalignr $8, %xmm11, %xmm13, %xmm11 +vpxor %xmm9, %xmm3, %xmm7 +vpshufd $177, %xmm7, %xmm14 +vpxor %xmm8, %xmm11, %xmm13 +vpshufd $177, %xmm13, %xmm3 +vpaddq %xmm14, %xmm10, %xmm6 +vpaddq %xmm3, %xmm2, %xmm10 +vpxor %xmm6, %xmm15, %xmm2 +vmovdqu 16(%rsi), %xmm15 +vpshufb %xmm1, %xmm2, %xmm7 +vpxor %xmm10, %xmm5, %xmm2 +vpshufb %xmm1, %xmm2, %xmm5 +vpunpcklqdq %xmm15, %xmm4, %xmm4 +vpunpckhqdq %xmm15, %xmm0, %xmm0 +vpaddq %xmm4, %xmm9, %xmm2 +vpaddq %xmm0, %xmm8, %xmm8 +vpaddq %xmm7, %xmm2, %xmm2 +vpaddq %xmm5, %xmm8, %xmm0 +vpxor %xmm2, %xmm14, %xmm15 +vpxor %xmm0, %xmm3, %xmm9 +vpshufb %xmm12, %xmm15, %xmm15 +vpshufb %xmm12, %xmm9, %xmm3 +vpaddq %xmm15, %xmm6, %xmm8 +vpaddq %xmm3, %xmm10, %xmm6 +vpxor %xmm8, %xmm7, %xmm10 +vpxor %xmm6, %xmm5, %xmm5 +vpaddq %xmm5, %xmm5, %xmm9 +vpsrlq $63, %xmm10, %xmm4 +vpsrlq $63, %xmm5, %xmm7 +vpaddq %xmm10, %xmm10, %xmm10 +vpor %xmm10, %xmm4, %xmm13 +vpor %xmm9, %xmm7, %xmm11 +vpalignr $8, %xmm11, %xmm13, %xmm4 +vpalignr $8, %xmm13, %xmm11, %xmm7 +vpalignr $8, %xmm15, %xmm3, %xmm9 +vpalignr $8, %xmm3, %xmm15, %xmm10 +blake2b_blocks_avx_L23: +movzbl (%rcx), %edi +movzbl 2(%rcx), %r9d +movzbl 4(%rcx), %r10d +movzbl 6(%rcx), %r11d +vmovq (%rdi,%rsi), %xmm5 +vpinsrq $1, (%r9,%rsi), %xmm5, %xmm14 +vmovq (%r10,%rsi), %xmm3 +vpinsrq $1, (%r11,%rsi), %xmm3, %xmm15 +vpaddq %xmm14, %xmm2, %xmm2 +vpaddq %xmm15, %xmm0, %xmm0 +vpaddq %xmm4, %xmm2, %xmm2 +vpaddq %xmm7, %xmm0, %xmm0 +vpxor %xmm2, %xmm9, %xmm11 +vpxor %xmm0, %xmm10, %xmm10 +vpshufd $177, %xmm11, %xmm3 +movzbl 1(%rcx), %r12d +movzbl 5(%rcx), %r14d +vpshufd $177, %xmm10, %xmm5 +vpaddq %xmm3, %xmm6, %xmm6 +vpaddq %xmm5, %xmm8, %xmm9 +movzbl 3(%rcx), %r13d +vpxor %xmm6, %xmm4, %xmm14 +movzbl 7(%rcx), %r15d +vpxor %xmm9, %xmm7, %xmm15 +vmovq (%r12,%rsi), 
%xmm4 +vmovq (%r14,%rsi), %xmm11 +vpinsrq $1, (%r13,%rsi), %xmm4, %xmm7 +vpinsrq $1, (%r15,%rsi), %xmm11, %xmm13 +vpshufb %xmm1, %xmm14, %xmm8 +vpshufb %xmm1, %xmm15, %xmm14 +vpaddq %xmm7, %xmm2, %xmm2 +vpaddq %xmm13, %xmm0, %xmm0 +vpaddq %xmm8, %xmm2, %xmm4 +vpaddq %xmm14, %xmm0, %xmm7 +vpxor %xmm4, %xmm3, %xmm10 +vpxor %xmm7, %xmm5, %xmm3 +vpshufb %xmm12, %xmm10, %xmm11 +vpshufb %xmm12, %xmm3, %xmm10 +vpaddq %xmm11, %xmm6, %xmm13 +vpaddq %xmm10, %xmm9, %xmm9 +movzbl 8(%rcx), %edi +vpxor %xmm13, %xmm8, %xmm8 +movzbl 12(%rcx), %r10d +vpxor %xmm9, %xmm14, %xmm2 +movzbl 10(%rcx), %r9d +vpsrlq $63, %xmm8, %xmm6 +movzbl 14(%rcx), %r11d +vpsrlq $63, %xmm2, %xmm0 +vpaddq %xmm8, %xmm8, %xmm5 +vpaddq %xmm2, %xmm2, %xmm14 +vmovq (%rdi,%rsi), %xmm15 +vpor %xmm5, %xmm6, %xmm8 +vmovq (%r10,%rsi), %xmm3 +vpor %xmm14, %xmm0, %xmm6 +vpinsrq $1, (%r9,%rsi), %xmm15, %xmm5 +vpinsrq $1, (%r11,%rsi), %xmm3, %xmm0 +vpalignr $8, %xmm8, %xmm6, %xmm2 +vpalignr $8, %xmm6, %xmm8, %xmm14 +vpalignr $8, %xmm10, %xmm11, %xmm8 +vpalignr $8, %xmm11, %xmm10, %xmm11 +vpaddq %xmm5, %xmm4, %xmm4 +vpaddq %xmm0, %xmm7, %xmm7 +vpaddq %xmm2, %xmm4, %xmm15 +vpaddq %xmm14, %xmm7, %xmm0 +vpxor %xmm15, %xmm8, %xmm6 +vpxor %xmm0, %xmm11, %xmm10 +vpshufd $177, %xmm6, %xmm6 +vpshufd $177, %xmm10, %xmm8 +movzbl 9(%rcx), %r12d +movzbl 13(%rcx), %r14d +vpaddq %xmm6, %xmm9, %xmm4 +vpaddq %xmm8, %xmm13, %xmm7 +movzbl 11(%rcx), %r13d +vpxor %xmm4, %xmm2, %xmm9 +movzbl 15(%rcx), %r15d +vpxor %xmm7, %xmm14, %xmm2 +vmovq (%r12,%rsi), %xmm14 +addq $16, %rcx +vmovq (%r14,%rsi), %xmm3 +vpshufb %xmm1, %xmm9, %xmm13 +vpinsrq $1, (%r13,%rsi), %xmm14, %xmm5 +vpinsrq $1, (%r15,%rsi), %xmm3, %xmm9 +vpshufb %xmm1, %xmm2, %xmm11 +vpaddq %xmm5, %xmm15, %xmm15 +vpaddq %xmm9, %xmm0, %xmm0 +vpaddq %xmm13, %xmm15, %xmm2 +vpaddq %xmm11, %xmm0, %xmm0 +vpxor %xmm2, %xmm6, %xmm6 +vpxor %xmm0, %xmm8, %xmm8 +vpshufb %xmm12, %xmm6, %xmm14 +vpshufb %xmm12, %xmm8, %xmm15 +vpaddq %xmm14, %xmm4, %xmm8 +vpaddq %xmm15, %xmm7, %xmm6 +vpxor %xmm8, %xmm13, %xmm4 +vpxor %xmm6, %xmm11, %xmm11 +vpaddq %xmm4, %xmm4, %xmm10 +vpsrlq $63, %xmm4, %xmm7 +vpsrlq $63, %xmm11, %xmm13 +vpaddq %xmm11, %xmm11, %xmm4 +vpor %xmm10, %xmm7, %xmm3 +vpor %xmm4, %xmm13, %xmm11 +vpalignr $8, %xmm11, %xmm3, %xmm4 +vpalignr $8, %xmm3, %xmm11, %xmm7 +vpalignr $8, %xmm15, %xmm14, %xmm10 +vpalignr $8, %xmm14, %xmm15, %xmm9 +cmpq %rbx, %rcx +jb blake2b_blocks_avx_L23 +blake2b_blocks_avx_L24: +movq 32(%rsi), %r13 +movq (%rsi), %r10 +movq 48(%rsi), %r9 +vmovd %r13, %xmm13 +vpinsrq $1, %r9, %xmm13, %xmm14 +vmovd %r10, %xmm3 +movq 16(%rsi), %rbx +vpinsrq $1, %rbx, %xmm3, %xmm15 +vpaddq %xmm14, %xmm0, %xmm0 +vpaddq %xmm7, %xmm0, %xmm3 +vpxor %xmm3, %xmm10, %xmm10 +vpaddq %xmm15, %xmm2, %xmm2 +vpaddq %xmm4, %xmm2, %xmm5 +vpshufd $177, %xmm10, %xmm15 +vpxor %xmm5, %xmm9, %xmm9 +vpshufd $177, %xmm9, %xmm9 +vpaddq %xmm15, %xmm8, %xmm14 +vpaddq %xmm9, %xmm6, %xmm0 +vpxor %xmm14, %xmm7, %xmm7 +vpxor %xmm0, %xmm4, %xmm8 +vpshufb %xmm1, %xmm7, %xmm4 +vpshufb %xmm1, %xmm8, %xmm2 +vmovq 8(%rsi), %xmm7 +movq %r8, 288(%rsp) +movq 24(%rsi), %r8 +vpinsrq $1, %r8, %xmm7, %xmm6 +vpinsrq $1, %r10, %xmm7, %xmm7 +vpaddq %xmm6, %xmm5, %xmm13 +movq 40(%rsi), %rcx +movq 56(%rsi), %rdi +vpaddq %xmm2, %xmm13, %xmm13 +vmovd %rcx, %xmm5 +vpxor %xmm13, %xmm9, %xmm9 +vpinsrq $1, %rdi, %xmm5, %xmm10 +vpshufb %xmm12, %xmm9, %xmm5 +vpaddq %xmm10, %xmm3, %xmm3 +vpaddq %xmm4, %xmm3, %xmm11 +vpaddq %xmm5, %xmm0, %xmm3 +vpxor %xmm11, %xmm15, %xmm8 +vpshufb %xmm12, %xmm8, %xmm10 +vpaddq %xmm10, %xmm14, %xmm8 +vpxor %xmm3, %xmm2, %xmm14 +vpxor 
%xmm8, %xmm4, %xmm9 +vpsrlq $63, %xmm14, %xmm4 +vpsrlq $63, %xmm9, %xmm0 +vpaddq %xmm14, %xmm14, %xmm14 +movq 64(%rsi), %r15 +vpor %xmm14, %xmm4, %xmm6 +vpaddq %xmm9, %xmm9, %xmm4 +vmovq 96(%rsi), %xmm9 +vpor %xmm4, %xmm0, %xmm2 +movq 112(%rsi), %r14 +vmovd %r15, %xmm15 +vpinsrq $1, %r14, %xmm9, %xmm0 +vpinsrq $1, %rbx, %xmm9, %xmm9 +vpalignr $8, %xmm6, %xmm2, %xmm4 +vpalignr $8, %xmm2, %xmm6, %xmm2 +vpaddq %xmm0, %xmm11, %xmm11 +movq 80(%rsi), %r11 +vpinsrq $1, %r11, %xmm15, %xmm14 +vpaddq %xmm2, %xmm11, %xmm11 +vpalignr $8, %xmm10, %xmm5, %xmm15 +vpalignr $8, %xmm5, %xmm10, %xmm5 +vpxor %xmm11, %xmm5, %xmm10 +vpaddq %xmm14, %xmm13, %xmm13 +vpaddq %xmm4, %xmm13, %xmm6 +vpshufd $177, %xmm10, %xmm14 +vpxor %xmm6, %xmm15, %xmm13 +vpaddq %xmm14, %xmm3, %xmm0 +vpshufd $177, %xmm13, %xmm13 +vpaddq %xmm13, %xmm8, %xmm15 +vpxor %xmm0, %xmm2, %xmm8 +vpxor %xmm15, %xmm4, %xmm3 +vpshufb %xmm1, %xmm8, %xmm5 +vpshufb %xmm1, %xmm3, %xmm4 +vmovq 72(%rsi), %xmm8 +movq %rax, 296(%rsp) +movq 88(%rsi), %rax +vpinsrq $1, %rax, %xmm8, %xmm2 +movq 104(%rsi), %r12 +vpaddq %xmm2, %xmm6, %xmm6 +vpinsrq $1, %r12, %xmm8, %xmm8 +vmovd %r12, %xmm3 +vpaddq %xmm4, %xmm6, %xmm10 +vpxor %xmm10, %xmm13, %xmm13 +movq %rsi, 280(%rsp) +movq 120(%rsi), %rsi +vpinsrq $1, %rsi, %xmm3, %xmm6 +vpshufb %xmm12, %xmm13, %xmm3 +vpaddq %xmm6, %xmm11, %xmm11 +vpaddq %xmm5, %xmm11, %xmm6 +vpxor %xmm6, %xmm14, %xmm14 +vpshufb %xmm12, %xmm14, %xmm2 +vpaddq %xmm3, %xmm15, %xmm14 +vpaddq %xmm2, %xmm0, %xmm0 +vpaddq %xmm8, %xmm6, %xmm6 +vpxor %xmm14, %xmm4, %xmm4 +vpxor %xmm0, %xmm5, %xmm13 +vpsrlq $63, %xmm4, %xmm5 +vpsrlq $63, %xmm13, %xmm15 +vpaddq %xmm4, %xmm4, %xmm4 +vpaddq %xmm13, %xmm13, %xmm13 +vpor %xmm4, %xmm5, %xmm11 +vpor %xmm13, %xmm15, %xmm5 +vpalignr $8, %xmm5, %xmm11, %xmm15 +vmovd %r11, %xmm4 +vpalignr $8, %xmm11, %xmm5, %xmm5 +vmovd %r14, %xmm11 +vpinsrq $1, %r13, %xmm11, %xmm13 +vpinsrq $1, %r15, %xmm4, %xmm11 +vpaddq %xmm5, %xmm6, %xmm6 +vpaddq %xmm13, %xmm10, %xmm10 +vpaddq %xmm15, %xmm10, %xmm10 +vpalignr $8, %xmm3, %xmm2, %xmm13 +vpxor %xmm10, %xmm13, %xmm8 +vmovd %rsi, %xmm13 +vpshufd $177, %xmm8, %xmm8 +vpalignr $8, %xmm2, %xmm3, %xmm3 +vpxor %xmm6, %xmm3, %xmm2 +vpaddq %xmm8, %xmm0, %xmm3 +vpaddq %xmm11, %xmm10, %xmm10 +vpxor %xmm3, %xmm15, %xmm0 +vpshufd $177, %xmm2, %xmm2 +vpshufb %xmm1, %xmm0, %xmm0 +vpaddq %xmm2, %xmm14, %xmm14 +vpxor %xmm14, %xmm5, %xmm5 +vpshufb %xmm1, %xmm5, %xmm15 +vpaddq %xmm0, %xmm10, %xmm5 +vpinsrq $1, %r9, %xmm13, %xmm10 +vpaddq %xmm10, %xmm6, %xmm6 +vpaddq %xmm15, %xmm6, %xmm13 +vpxor %xmm5, %xmm8, %xmm10 +vpxor %xmm13, %xmm2, %xmm8 +vpshufb %xmm12, %xmm10, %xmm4 +vpshufb %xmm12, %xmm8, %xmm6 +vpaddq %xmm4, %xmm3, %xmm8 +vpaddq %xmm6, %xmm14, %xmm2 +vpxor %xmm8, %xmm0, %xmm14 +vpxor %xmm2, %xmm15, %xmm15 +vpaddq %xmm14, %xmm14, %xmm0 +vpsrlq $63, %xmm14, %xmm3 +vpsrlq $63, %xmm15, %xmm14 +vpor %xmm0, %xmm3, %xmm10 +vpaddq %xmm15, %xmm15, %xmm3 +vpor %xmm3, %xmm14, %xmm0 +vpaddq %xmm7, %xmm5, %xmm14 +vpalignr $8, %xmm10, %xmm0, %xmm11 +vmovd %rax, %xmm5 +vpaddq %xmm11, %xmm14, %xmm7 +vpinsrq $1, %rcx, %xmm5, %xmm14 +vpalignr $8, %xmm0, %xmm10, %xmm15 +vpaddq %xmm9, %xmm7, %xmm3 +vmovd %rdi, %xmm9 +vpinsrq $1, %r8, %xmm9, %xmm10 +vpaddq %xmm14, %xmm13, %xmm13 +vpaddq %xmm15, %xmm13, %xmm5 +vpalignr $8, %xmm6, %xmm4, %xmm13 +vpalignr $8, %xmm4, %xmm6, %xmm4 +vpxor %xmm7, %xmm13, %xmm14 +vpxor %xmm5, %xmm4, %xmm6 +vpshufd $177, %xmm14, %xmm13 +vpshufd $177, %xmm6, %xmm14 +vpaddq %xmm13, %xmm2, %xmm6 +vpaddq %xmm14, %xmm8, %xmm4 +vpaddq %xmm10, %xmm5, %xmm5 +vpxor %xmm6, %xmm11, %xmm2 +vpxor 
%xmm4, %xmm15, %xmm8 +vpshufb %xmm1, %xmm2, %xmm2 +vpshufb %xmm1, %xmm8, %xmm8 +vpaddq %xmm2, %xmm3, %xmm7 +vpaddq %xmm8, %xmm5, %xmm5 +vpxor %xmm7, %xmm13, %xmm13 +vpxor %xmm5, %xmm14, %xmm14 +vpshufb %xmm12, %xmm13, %xmm13 +vpshufb %xmm12, %xmm14, %xmm14 +vpaddq %xmm13, %xmm6, %xmm10 +vpaddq %xmm14, %xmm4, %xmm0 +vpxor %xmm10, %xmm2, %xmm2 +vpxor %xmm0, %xmm8, %xmm8 +vpaddq %xmm2, %xmm2, %xmm6 +vpaddq %xmm8, %xmm8, %xmm15 +vpsrlq $63, %xmm2, %xmm4 +vpsrlq $63, %xmm8, %xmm11 +vpor %xmm6, %xmm4, %xmm3 +vpor %xmm15, %xmm11, %xmm9 +vpxor %xmm0, %xmm7, %xmm0 +vpxor 208(%rsp), %xmm0, %xmm7 +vpxor %xmm10, %xmm5, %xmm0 +vpalignr $8, %xmm9, %xmm3, %xmm4 +vpalignr $8, %xmm13, %xmm14, %xmm5 +vpalignr $8, %xmm3, %xmm9, %xmm3 +vpxor %xmm5, %xmm4, %xmm6 +vpalignr $8, %xmm14, %xmm13, %xmm8 +vpxor %xmm8, %xmm3, %xmm9 +vmovdqu %xmm7, 208(%rsp) +vpxor 224(%rsp), %xmm0, %xmm2 +vpxor 256(%rsp), %xmm6, %xmm7 +vpxor 240(%rsp), %xmm9, %xmm10 +movq 296(%rsp), %rax +movq 288(%rsp), %r8 +movq 280(%rsp), %rsi +vmovdqu %xmm2, 224(%rsp) +vmovdqu %xmm7, 256(%rsp) +vmovdqu %xmm10, 240(%rsp) +cmpq $128, %rdx +ja blake2b_blocks_avx_L25 +blake2b_blocks_avx_L26: +vmovdqu 240(%rsp), %xmm8 +vmovdqu 256(%rsp), %xmm7 +vmovdqu 224(%rsp), %xmm6 +vmovdqu 208(%rsp), %xmm5 +movq 128(%rsp), %rdi +vmovdqu %xmm5, (%rdi) +vmovdqu %xmm6, 16(%rdi) +vmovdqu %xmm7, 32(%rdi) +vmovdqu %xmm8, 48(%rdi) +movq %r8, 64(%rdi) +movq %rax, 72(%rdi) +addq $344, %rsp +popq %rbx +popq %r15 +popq %r14 +popq %r13 +popq %r12 +movq %rbp, %rsp +popq %rbp +ret +FN_END blake2b_blocks_avx
\ No newline at end of file diff --git a/src/libcryptobox/blake2/blake2.c b/src/libcryptobox/blake2/blake2.c index f11eb33ef..9c3ce8c2b 100644 --- a/src/libcryptobox/blake2/blake2.c +++ b/src/libcryptobox/blake2/blake2.c @@ -50,11 +50,6 @@ typedef struct blake2b_impl_t { #define BLAKE2B_IMPL(cpuflags, desc, ext) \ {(cpuflags), desc, blake2b_blocks_##ext} -#if defined(HAVE_AVX2) -BLAKE2B_DECLARE(avx2) -#define BLAKE2B_AVX2 BLAKE2B_IMPL(CPUID_AVX2, "avx2", avx2) -#endif - #if defined(HAVE_AVX) BLAKE2B_DECLARE(avx) #define BLAKE2B_AVX BLAKE2B_IMPL(CPUID_AVX, "avx", avx) @@ -69,21 +64,15 @@ BLAKE2B_DECLARE(x86) BLAKE2B_DECLARE(ref) #define BLAKE2B_GENERIC BLAKE2B_IMPL(0, "generic", ref) - - /* list implemenations from most optimized to least, with generic as the last entry */ static const blake2b_impl_t blake2b_list[] = { - /* x86 */ -#if defined(BLAKE2B_AVX2) - BLAKE2B_AVX2, -#endif + BLAKE2B_GENERIC, #if defined(BLAKE2B_AVX) BLAKE2B_AVX, #endif #if defined(BLAKE2B_X86) BLAKE2B_X86, #endif - BLAKE2B_GENERIC }; static const blake2b_impl_t *blake2b_opt = &blake2b_list[0]; diff --git a/src/libcryptobox/blake2/constants.S b/src/libcryptobox/blake2/constants.S new file mode 100644 index 000000000..5d1a70813 --- /dev/null +++ b/src/libcryptobox/blake2/constants.S @@ -0,0 +1,30 @@ + +.p2align 6 +blake2b_constants: +.quad 0x6a09e667f3bcc908 +.quad 0xbb67ae8584caa73b +.quad 0x3c6ef372fe94f82b +.quad 0xa54ff53a5f1d36f1 +.quad 0x510e527fade682d1 +.quad 0x9b05688c2b3e6c1f +.quad 0x1f83d9abfb41bd6b +.quad 0x5be0cd19137e2179 + +blake2b_sigma: +.byte 0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120 +.byte 112,80,32,64,72,120,104,48,8,96,0,16,88,56,40,24 +.byte 88,64,96,0,40,16,120,104,80,112,24,48,56,8,72,32 +.byte 56,72,24,8,104,96,88,112,16,48,40,80,32,0,120,64 +.byte 72,0,40,56,16,32,80,120,112,8,88,96,48,64,24,104 +.byte 16,96,48,80,0,88,64,24,32,104,56,40,120,112,8,72 +.byte 96,40,8,120,112,104,32,80,0,56,48,24,72,16,64,88 +.byte 104,88,56,112,96,8,24,72,40,0,120,32,64,48,16,80 +.byte 48,120,112,72,88,24,0,64,96,16,104,56,8,32,80,40 +.byte 80,16,64,32,56,48,8,40,120,88,72,112,24,96,104,0 +.byte 0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120 +.byte 112,80,32,64,72,120,104,48,8,96,0,16,88,56,40,24 + +.p2align 4 +blake2b_constants_ssse3: +.byte 2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9 /* 64 bit rotate right by 16 */ +.byte 3,4,5,6,7,0,1,2,11,12,13,14,15,8,9,10 /* 64 bit rotate right by 24 */
\ No newline at end of file diff --git a/src/libcryptobox/blake2/ref.c b/src/libcryptobox/blake2/ref.c index 15b74351b..ed6f395fc 100644 --- a/src/libcryptobox/blake2/ref.c +++ b/src/libcryptobox/blake2/ref.c @@ -77,7 +77,7 @@ U64TO8 (unsigned char *p, blake2b_uint64 v) p[7] = (v >> 56) & 0xff; } -static void +void blake2b_blocks_ref (blake2b_state_internal *S, const unsigned char *in, size_t bytes, diff --git a/src/libcryptobox/blake2/x86-32.S b/src/libcryptobox/blake2/x86-32.S new file mode 100644 index 000000000..12030e57b --- /dev/null +++ b/src/libcryptobox/blake2/x86-32.S @@ -0,0 +1,1080 @@ +#include "../macro.S" +#include "constants.S" + +SECTION_TEXT + +GLOBAL_HIDDEN_FN blake2b_blocks_x86 +pushl %esi +pushl %edi +pushl %ebx +pushl %ebp +subl $492, %esp +movl 512(%esp), %eax +movl 80(%eax), %ebp +movl %ebp, %edi +movl 84(%eax), %ebx +xorl $-79577749, %edi +movl %edi, 144(%esp) +movl %ebx, %edi +xorl $528734635, %edi +movl %edi, 148(%esp) +movl 88(%eax), %edi +movl 92(%eax), %eax +xorl $327033209, %edi +xorl $1541459225, %eax +movl %edi, 152(%esp) +LOAD_VAR_PIC blake2b_sigma, %ecx +lea 192(%ecx), %edi +movl 516(%esp), %esi +orl %ebx, %ebp +movl 520(%esp), %edx +movl %edi, 360(%esp) +jne blake2b_blocks_x86_L2 +blake2b_blocks_x86_L32: +cmpl $128, %edx +jmp blake2b_blocks_x86_L21 +blake2b_blocks_x86_L2: +cmpl $128, %edx +je blake2b_blocks_x86_L21 +blake2b_blocks_x86_L3: +testb $64, %dl +lea (%esp), %ebp +je blake2b_blocks_x86_L5 +blake2b_blocks_x86_L4: +movl (%esi), %ebx +movl 4(%esi), %ebp +movl %ebx, (%esp) +movl %ebp, 4(%esp) +movl 8(%esi), %edi +movl 12(%esi), %ebx +movl %edi, 8(%esp) +movl %ebx, 12(%esp) +movl 16(%esi), %ebp +movl 20(%esi), %edi +movl %ebp, 16(%esp) +movl %edi, 20(%esp) +movl 24(%esi), %ebx +movl 28(%esi), %ebp +movl %ebx, 24(%esp) +movl %ebp, 28(%esp) +movl 32(%esi), %edi +movl 36(%esi), %ebx +movl %edi, 32(%esp) +movl %ebx, 36(%esp) +movl 40(%esi), %ebp +movl 44(%esi), %edi +movl %ebp, 40(%esp) +movl %edi, 44(%esp) +movl 48(%esi), %ebx +movl 52(%esi), %ebp +movl %ebx, 48(%esp) +movl %ebp, 52(%esp) +lea 64(%esp), %ebp +movl 56(%esi), %edi +movl 60(%esi), %ebx +addl $64, %esi +movl %edi, 56(%esp) +movl %ebx, 60(%esp) +jmp blake2b_blocks_x86_L6 +blake2b_blocks_x86_L5: +xorl %ebx, %ebx +movl %ebx, 64(%esp) +movl %ebx, 68(%esp) +movl %ebx, 72(%esp) +movl %ebx, 76(%esp) +movl %ebx, 80(%esp) +movl %ebx, 84(%esp) +movl %ebx, 88(%esp) +movl %ebx, 92(%esp) +movl %ebx, 96(%esp) +movl %ebx, 100(%esp) +movl %ebx, 104(%esp) +movl %ebx, 108(%esp) +movl %ebx, 112(%esp) +movl %ebx, 116(%esp) +movl %ebx, 120(%esp) +movl %ebx, 124(%esp) +blake2b_blocks_x86_L6: +xorl %ebx, %ebx +testb $32, %dl +movl %ebx, (%ebp) +movl %ebx, 4(%ebp) +movl %ebx, 8(%ebp) +movl %ebx, 12(%ebp) +movl %ebx, 16(%ebp) +movl %ebx, 20(%ebp) +movl %ebx, 24(%ebp) +movl %ebx, 28(%ebp) +movl %ebx, 32(%ebp) +movl %ebx, 36(%ebp) +movl %ebx, 40(%ebp) +movl %ebx, 44(%ebp) +movl %ebx, 48(%ebp) +movl %ebx, 52(%ebp) +movl %ebx, 56(%ebp) +movl %ebx, 60(%ebp) +je blake2b_blocks_x86_L8 +blake2b_blocks_x86_L7: +movl (%esi), %ebx +movl 4(%esi), %edi +movl %ebx, (%ebp) +movl %edi, 4(%ebp) +movl 8(%esi), %ebx +movl 12(%esi), %edi +movl %ebx, 8(%ebp) +movl %edi, 12(%ebp) +movl 16(%esi), %ebx +movl 20(%esi), %edi +movl %ebx, 16(%ebp) +movl %edi, 20(%ebp) +movl 24(%esi), %ebx +movl 28(%esi), %edi +addl $32, %esi +movl %ebx, 24(%ebp) +movl %edi, 28(%ebp) +addl $32, %ebp +blake2b_blocks_x86_L8: +testb $16, %dl +je blake2b_blocks_x86_L10 +blake2b_blocks_x86_L9: +movl (%esi), %ebx +movl 4(%esi), %edi +movl %ebx, (%ebp) +movl %edi, 
4(%ebp) +movl 8(%esi), %ebx +movl 12(%esi), %edi +addl $16, %esi +movl %ebx, 8(%ebp) +movl %edi, 12(%ebp) +addl $16, %ebp +blake2b_blocks_x86_L10: +testb $8, %dl +je blake2b_blocks_x86_L12 +blake2b_blocks_x86_L11: +movl (%esi), %ebx +movl 4(%esi), %edi +addl $8, %esi +movl %ebx, (%ebp) +movl %edi, 4(%ebp) +addl $8, %ebp +blake2b_blocks_x86_L12: +testb $4, %dl +je blake2b_blocks_x86_L14 +blake2b_blocks_x86_L13: +movl (%esi), %ebx +addl $4, %esi +movl %ebx, (%ebp) +addl $4, %ebp +blake2b_blocks_x86_L14: +testb $2, %dl +je blake2b_blocks_x86_L16 +blake2b_blocks_x86_L15: +movzwl (%esi), %ebx +addl $2, %esi +movw %bx, (%ebp) +addl $2, %ebp +blake2b_blocks_x86_L16: +testb $1, %dl +je blake2b_blocks_x86_L18 +blake2b_blocks_x86_L17: +movzbl (%esi), %ebx +movb %bl, (%ebp) +blake2b_blocks_x86_L18: +cmpl $128, %edx +lea (%esp), %esi +blake2b_blocks_x86_L21: +movl 512(%esp), %ebp +lea (%ecx), %ecx +movl %esi, 236(%esp) +movl %ecx, 128(%esp) +movl 68(%ebp), %edi +movl %edi, 228(%esp) +movl 60(%ebp), %edi +movl %edi, 196(%esp) +movl 72(%ebp), %edi +movl %edi, 164(%esp) +movl 76(%ebp), %edi +movl %edi, 200(%esp) +movl 24(%ebp), %edi +movl %edi, 176(%esp) +movl 28(%ebp), %edi +movl %edi, 208(%esp) +movl 16(%ebp), %edi +movl %edi, 184(%esp) +movl 20(%ebp), %edi +movl %edi, 216(%esp) +movl 48(%ebp), %edi +movl %edi, 168(%esp) +movl 52(%ebp), %edi +movl %edi, 204(%esp) +movl 8(%ebp), %edi +movl 64(%ebp), %ebx +movl %edi, 156(%esp) +movl 12(%ebp), %edi +movl %ebx, 192(%esp) +movl 56(%ebp), %ebx +movl %edi, 224(%esp) +movl 40(%ebp), %edi +movl %ebx, 172(%esp) +movl %edx, %ebx +movl %edi, 160(%esp) +movl 44(%ebp), %edi +jbe blake2b_blocks_x86_LL3 +movl $128, %ebx +blake2b_blocks_x86_LL3: +movl %edi, 212(%esp) +movl (%ebp), %edi +movl %edi, 180(%esp) +movl 4(%ebp), %edi +movl %edi, 232(%esp) +movl 32(%ebp), %edi +movl 36(%ebp), %ebp +movl %edi, 188(%esp) +movl %ebp, 220(%esp) +movl %eax, 132(%esp) +movl %ebx, 136(%esp) +movl %edx, 140(%esp) +movl 512(%esp), %esi +jmp blake2b_blocks_x86_L22 +blake2b_blocks_x86_L28: +movl 524(%esp), %eax +movl 140(%esp), %edx +addl $-128, %edx +addl %eax, 236(%esp) +movl %edx, 140(%esp) +blake2b_blocks_x86_L22: +movl 136(%esp), %edx +xorl %ebx, %ebx +movl 192(%esp), %eax +addl %edx, %eax +movl 228(%esp), %ecx +adcl $0, %ecx +movl %eax, 192(%esp) +movl %eax, 64(%esi) +subl %edx, %eax +movl %ecx, 228(%esp) +movl %ecx, 68(%esi) +sbbl %ebx, %ecx +jae blake2b_blocks_x86_L25 +blake2b_blocks_x86_L23: +movl 164(%esp), %eax +addl $1, %eax +movl 200(%esp), %edx +adcl $0, %edx +movl %eax, 164(%esp) +movl %edx, 200(%esp) +movl %eax, 72(%esi) +movl %edx, 76(%esi) +blake2b_blocks_x86_L25: +movl 152(%esp), %eax +movl %eax, 312(%esp) +movl 172(%esp), %ebp +movl 196(%esp), %ebx +movl 144(%esp), %eax +movl 184(%esp), %edi +movl %ebp, 284(%esp) +movl %ebx, 288(%esp) +movl %eax, 296(%esp) +movl 168(%esp), %ebp +movl 204(%esp), %ebx +movl 212(%esp), %eax +movl %edi, 332(%esp) +movl %ebp, 276(%esp) +movl %ebx, 280(%esp) +movl 148(%esp), %edi +movl %eax, 272(%esp) +movl 224(%esp), %ebp +movl 160(%esp), %ebx +movl 188(%esp), %eax +movl 208(%esp), %ecx +movl %edi, 300(%esp) +movl %ebp, 248(%esp) +movl %ebx, 268(%esp) +movl 180(%esp), %edi +movl %eax, 260(%esp) +movl 176(%esp), %edx +movl 164(%esp), %ebp +movl 232(%esp), %ebx +xorl $725511199, %ebp +movl 128(%esp), %eax +movl %ebp, 348(%esp) +movl %ecx, 256(%esp) +movl 200(%esp), %ebp +movl 216(%esp), %ecx +xorl $-1694144372, %ebp +movl %edi, 240(%esp) +movl %edx, 316(%esp) +movl %ebx, 244(%esp) +movl 220(%esp), %edi +movl %eax, 292(%esp) +movl 192(%esp), 
%ebx +xorl $-1377402159, %ebx +movl %ebx, 352(%esp) +movl %ecx, 252(%esp) +movl 228(%esp), %ebx +movl %ebp, 356(%esp) +xorl $1359893119, %ebx +movl 132(%esp), %edx +movl 156(%esp), %ecx +movl 332(%esp), %ebp +movl 316(%esp), %esi +movl %edi, 264(%esp) +movl $1595750129, 308(%esp) +movl $-1521486534, 304(%esp) +movl $-23791573, 324(%esp) +movl $1013904242, 320(%esp) +movl $-2067093701, 340(%esp) +movl $-1150833019, 336(%esp) +movl $-205731576, 328(%esp) +movl $1779033703, 344(%esp) +blake2b_blocks_x86_L26: +movl %esi, 316(%esp) +movl %edx, 368(%esp) +movzbl (%eax), %esi +movl 236(%esp), %edx +movl %ecx, 364(%esp) +movl 240(%esp), %ecx +addl (%esi,%edx), %ecx +movl %ebp, 332(%esp) +movl 244(%esp), %ebp +adcl 4(%esi,%edx), %ebp +movl 260(%esp), %edx +addl %edx, %ecx +movl 264(%esp), %esi +adcl %esi, %ebp +xorl %ebp, %ebx +movl 352(%esp), %edi +movl %ecx, 240(%esp) +xorl %ecx, %edi +movl 328(%esp), %ecx +addl %ebx, %ecx +movl %ebx, 372(%esp) +movl 344(%esp), %ebx +adcl %edi, %ebx +xorl %ecx, %edx +xorl %ebx, %esi +movl %edi, 352(%esp) +movl %edx, %edi +movl %ecx, 328(%esp) +movl %esi, %ecx +shrl $24, %esi +shll $8, %edx +orl %edx, %esi +movl %esi, 264(%esp) +movzbl 2(%eax), %edx +movl 236(%esp), %esi +shll $8, %ecx +shrl $24, %edi +orl %edi, %ecx +movl %ecx, 376(%esp) +movl 364(%esp), %ecx +addl (%edx,%esi), %ecx +movl 248(%esp), %edi +movl %ebp, 244(%esp) +movl 268(%esp), %ebp +adcl 4(%edx,%esi), %edi +addl %ebp, %ecx +movl 272(%esp), %edx +adcl %edx, %edi +movl %ebx, 344(%esp) +movl %ecx, 364(%esp) +movl 348(%esp), %ebx +xorl %ecx, %ebx +movl 356(%esp), %ecx +xorl %edi, %ecx +movl %edi, 248(%esp) +movl 340(%esp), %edi +addl %ecx, %edi +movl %ecx, 356(%esp) +movl 336(%esp), %ecx +adcl %ebx, %ecx +xorl %edi, %ebp +xorl %ecx, %edx +movl %ebx, 348(%esp) +movl %edx, %ebx +movl %edi, 340(%esp) +movl %ebp, %edi +shrl $24, %edx +shll $8, %ebp +orl %ebp, %edx +movzbl 4(%eax), %ebp +movl %ecx, 336(%esp) +shll $8, %ebx +shrl $24, %edi +movl 332(%esp), %ecx +orl %edi, %ebx +addl (%ebp,%esi), %ecx +movl 252(%esp), %edi +adcl 4(%ebp,%esi), %edi +movl 276(%esp), %ebp +addl %ebp, %ecx +movl %edx, 272(%esp) +movl 280(%esp), %edx +adcl %edx, %edi +movl %ebx, 380(%esp) +movl %ecx, 332(%esp) +movl 296(%esp), %ebx +xorl %ecx, %ebx +movl 300(%esp), %ecx +xorl %edi, %ecx +movl %edi, 252(%esp) +movl 324(%esp), %edi +addl %ecx, %edi +movl %ecx, 300(%esp) +movl 320(%esp), %ecx +adcl %ebx, %ecx +xorl %edi, %ebp +xorl %ecx, %edx +movl %ebx, 296(%esp) +movl %edx, %ebx +movl %edi, 324(%esp) +movl %ebp, %edi +shrl $24, %edx +shll $8, %ebp +orl %ebp, %edx +movl %edx, 280(%esp) +movzbl 6(%eax), %edx +movl %ecx, 320(%esp) +shll $8, %ebx +shrl $24, %edi +movl 316(%esp), %ecx +orl %edi, %ebx +addl (%edx,%esi), %ecx +movl 256(%esp), %edi +movl 284(%esp), %ebp +adcl 4(%edx,%esi), %edi +addl %ebp, %ecx +movl 288(%esp), %edx +adcl %edx, %edi +movl %ebx, 384(%esp) +movl %ecx, 316(%esp) +movl 312(%esp), %ebx +xorl %ecx, %ebx +movl 368(%esp), %ecx +xorl %edi, %ecx +movl %edi, 256(%esp) +movl 308(%esp), %edi +addl %ecx, %edi +movl %ecx, 368(%esp) +movl 304(%esp), %ecx +adcl %ebx, %ecx +xorl %edi, %ebp +xorl %ecx, %edx +movl %ebx, 312(%esp) +movl %edx, %ebx +movl %edi, 308(%esp) +movl %ebp, %edi +shrl $24, %edx +shll $8, %ebp +orl %ebp, %edx +movzbl 5(%eax), %ebp +movl %ecx, 304(%esp) +shll $8, %ebx +movl (%ebp,%esi), %ecx +addl 332(%esp), %ecx +movl 4(%ebp,%esi), %esi +adcl 252(%esp), %esi +shrl $24, %edi +orl %edi, %ebx +movl %ebx, 388(%esp) +movl 384(%esp), %ebx +addl %ebx, %ecx +movl %edx, 288(%esp) +movl 280(%esp), %edx +adcl 
%edx, %esi +movl 300(%esp), %ebp +movl 296(%esp), %edi +xorl %ecx, %ebp +xorl %esi, %edi +movl %ecx, 392(%esp) +movl %ebp, %ecx +movl %esi, 396(%esp) +movl %edi, %esi +shll $16, %esi +shrl $16, %ecx +shrl $16, %edi +orl %ecx, %esi +shll $16, %ebp +orl %ebp, %edi +movl 324(%esp), %ebp +addl %esi, %ebp +movl %esi, 400(%esp) +movl 320(%esp), %esi +adcl %edi, %esi +xorl %ebp, %ebx +xorl %esi, %edx +movl %esi, 320(%esp) +movl %edx, %esi +movl %edi, 296(%esp) +movl %ebx, %edi +shrl $31, %esi +addl %ebx, %ebx +shrl $31, %edi +addl %edx, %edx +orl %ebx, %esi +orl %edx, %edi +movl %esi, 408(%esp) +movzbl 7(%eax), %edx +movl 236(%esp), %esi +movl %edi, 404(%esp) +movl 288(%esp), %edi +movl (%edx,%esi), %ebx +addl 316(%esp), %ebx +movl 4(%edx,%esi), %ecx +movl 388(%esp), %edx +adcl 256(%esp), %ecx +addl %edx, %ebx +movl %ebp, 324(%esp) +adcl %edi, %ecx +movl 368(%esp), %ebp +movl 312(%esp), %esi +xorl %ebx, %ebp +xorl %ecx, %esi +movl %ebx, 412(%esp) +movl %ebp, %ebx +movl %ecx, 416(%esp) +movl %esi, %ecx +shll $16, %ecx +shrl $16, %ebx +shrl $16, %esi +orl %ebx, %ecx +shll $16, %ebp +orl %ebp, %esi +movl 308(%esp), %ebp +addl %ecx, %ebp +movl %ecx, 420(%esp) +movl 304(%esp), %ecx +adcl %esi, %ecx +xorl %ebp, %edx +movl %esi, 312(%esp) +xorl %ecx, %edi +movl %edx, %esi +movl %edi, %ebx +shrl $31, %esi +addl %edi, %edi +orl %edi, %esi +addl %edx, %edx +movl %esi, 424(%esp) +movzbl 3(%eax), %edi +movl 236(%esp), %esi +shrl $31, %ebx +orl %edx, %ebx +movl (%edi,%esi), %edx +addl 364(%esp), %edx +movl %ecx, 304(%esp) +movl 4(%edi,%esi), %ecx +movl 380(%esp), %edi +adcl 248(%esp), %ecx +addl %edi, %edx +movl 272(%esp), %esi +adcl %esi, %ecx +movl %ebp, 308(%esp) +movl %ebx, 428(%esp) +movl 356(%esp), %ebx +movl 348(%esp), %ebp +xorl %edx, %ebx +xorl %ecx, %ebp +movl %edx, 432(%esp) +movl %ebp, %edx +movl %ecx, 436(%esp) +movl %ebx, %ecx +shll $16, %edx +shrl $16, %ecx +shrl $16, %ebp +orl %ecx, %edx +shll $16, %ebx +orl %ebx, %ebp +movl 340(%esp), %ebx +addl %edx, %ebx +movl %edx, 440(%esp) +movl 336(%esp), %edx +adcl %ebp, %edx +xorl %ebx, %edi +movl %ebx, 340(%esp) +xorl %edx, %esi +movl %edi, %ebx +movl %esi, %ecx +shrl $31, %ebx +addl %esi, %esi +movl %edx, 336(%esp) +orl %esi, %ebx +movzbl 1(%eax), %esi +addl %edi, %edi +movl 236(%esp), %edx +shrl $31, %ecx +orl %edi, %ecx +movl (%esi,%edx), %edi +addl 240(%esp), %edi +movl %ebp, 348(%esp) +movl 4(%esi,%edx), %ebp +movl 376(%esp), %esi +adcl 244(%esp), %ebp +addl %esi, %edi +movl %ecx, 448(%esp) +movl 264(%esp), %ecx +adcl %ecx, %ebp +movl %ebx, 444(%esp) +movl 372(%esp), %ebx +movl 352(%esp), %edx +xorl %edi, %ebx +xorl %ebp, %edx +movl %edi, 452(%esp) +movl %edx, %edi +movl %ebp, 456(%esp) +movl %ebx, %ebp +shll $16, %edi +shrl $16, %ebp +shrl $16, %edx +orl %ebp, %edi +shll $16, %ebx +orl %ebx, %edx +movl 328(%esp), %ebx +addl %edi, %ebx +movl %edi, 460(%esp) +movl 344(%esp), %edi +adcl %edx, %edi +xorl %ebx, %esi +movl %edx, 352(%esp) +xorl %edi, %ecx +movl %esi, %edx +addl %esi, %esi +movl %ebx, 328(%esp) +movl %ecx, %ebx +shrl $31, %edx +addl %ecx, %ecx +movl %edi, 344(%esp) +orl %ecx, %edx +movzbl 8(%eax), %edi +movl 236(%esp), %ecx +shrl $31, %ebx +orl %esi, %ebx +movl %ebx, 468(%esp) +movl 452(%esp), %ebx +addl (%edi,%ecx), %ebx +movl 456(%esp), %esi +movl %edx, 464(%esp) +movl 448(%esp), %edx +adcl 4(%edi,%ecx), %esi +addl %edx, %ebx +movl 444(%esp), %edi +adcl %edi, %esi +movl 420(%esp), %ebp +movl %ebx, 452(%esp) +xorl %ebx, %ebp +movl 312(%esp), %ebx +xorl %esi, %ebx +movl %esi, 456(%esp) +movl 324(%esp), %esi +addl %ebx, %esi +movl 
%ebx, 312(%esp) +movl 320(%esp), %ebx +adcl %ebp, %ebx +xorl %esi, %edx +xorl %ebx, %edi +movl %ebp, 420(%esp) +movzbl 10(%eax), %ebp +movl %esi, 324(%esp) +movl %edx, %esi +movl %ebx, 320(%esp) +movl %edi, %ebx +shll $8, %ebx +shrl $24, %esi +orl %esi, %ebx +movl %ebx, 472(%esp) +movl (%ebp,%ecx), %ebx +addl 432(%esp), %ebx +movl 4(%ebp,%ecx), %esi +adcl 436(%esp), %esi +shrl $24, %edi +shll $8, %edx +orl %edx, %edi +movl 408(%esp), %edx +addl %edx, %ebx +movl %edi, 444(%esp) +movl 404(%esp), %edi +adcl %edi, %esi +movl 460(%esp), %ebp +movl %ebx, 364(%esp) +xorl %ebx, %ebp +movl 352(%esp), %ebx +xorl %esi, %ebx +movl %esi, 248(%esp) +movl 308(%esp), %esi +addl %ebx, %esi +movl %ebx, 352(%esp) +movl 304(%esp), %ebx +adcl %ebp, %ebx +xorl %esi, %edx +xorl %ebx, %edi +movl %esi, 308(%esp) +movl %edx, %esi +movl %ebx, 304(%esp) +movl %edi, %ebx +shrl $24, %edi +shll $8, %edx +orl %edx, %edi +movl %edi, 404(%esp) +movzbl 12(%eax), %edi +movl %ebp, 460(%esp) +shll $8, %ebx +shrl $24, %esi +movl (%edi,%ecx), %ebp +orl %esi, %ebx +addl 392(%esp), %ebp +movl 4(%edi,%ecx), %esi +movl 428(%esp), %edx +adcl 396(%esp), %esi +addl %edx, %ebp +movl %ebx, 476(%esp) +movl 424(%esp), %ebx +adcl %ebx, %esi +movl 440(%esp), %edi +movl %ebp, 332(%esp) +xorl %ebp, %edi +movl 348(%esp), %ebp +xorl %esi, %ebp +movl %esi, 252(%esp) +movl 328(%esp), %esi +addl %ebp, %esi +movl %ebp, 348(%esp) +movl 344(%esp), %ebp +adcl %edi, %ebp +xorl %esi, %edx +xorl %ebp, %ebx +movl %esi, 328(%esp) +movl %edx, %esi +movl %ebp, 344(%esp) +movl %ebx, %ebp +shrl $24, %ebx +shll $8, %edx +orl %edx, %ebx +movzbl 14(%eax), %edx +movl %eax, 292(%esp) +shll $8, %ebp +shrl $24, %esi +movl (%edx,%ecx), %eax +orl %esi, %ebp +addl 412(%esp), %eax +movl 4(%edx,%ecx), %esi +movl 468(%esp), %ecx +adcl 416(%esp), %esi +addl %ecx, %eax +movl 464(%esp), %edx +adcl %edx, %esi +movl %edi, 440(%esp) +movl %eax, 316(%esp) +movl 400(%esp), %edi +xorl %eax, %edi +movl 296(%esp), %eax +xorl %esi, %eax +movl %esi, 256(%esp) +movl 340(%esp), %esi +addl %eax, %esi +movl %eax, 296(%esp) +movl 336(%esp), %eax +adcl %edi, %eax +xorl %esi, %ecx +xorl %eax, %edx +movl %edi, 400(%esp) +movl %ecx, %edi +movl %esi, 340(%esp) +movl %edx, %esi +shrl $24, %edx +shll $8, %ecx +orl %ecx, %edx +movl %edx, 464(%esp) +movl 292(%esp), %edx +shll $8, %esi +shrl $24, %edi +orl %edi, %esi +movzbl 13(%edx), %edi +movl 236(%esp), %edx +movl 332(%esp), %ecx +addl %ebp, %ecx +movl %eax, 336(%esp) +movl 252(%esp), %eax +adcl %ebx, %eax +addl (%edi,%edx), %ecx +movl %ecx, 332(%esp) +adcl 4(%edi,%edx), %eax +movl 348(%esp), %edi +movl 440(%esp), %edx +xorl %ecx, %edi +xorl %eax, %edx +movl %edi, %ecx +movl %eax, 252(%esp) +movl %edx, %eax +shll $16, %eax +shrl $16, %ecx +shrl $16, %edx +orl %ecx, %eax +shll $16, %edi +orl %edx, %edi +movl 328(%esp), %edx +addl %eax, %edx +movl %eax, 348(%esp) +movl 344(%esp), %eax +adcl %edi, %eax +xorl %edx, %ebp +xorl %eax, %ebx +movl %eax, 344(%esp) +movl %ebx, %eax +movl %edi, 356(%esp) +movl %ebp, %edi +shrl $31, %eax +addl %ebp, %ebp +orl %ebp, %eax +addl %ebx, %ebx +movl %eax, 284(%esp) +movl 292(%esp), %eax +shrl $31, %edi +orl %ebx, %edi +movl %edi, 288(%esp) +movzbl 15(%eax), %ebx +movl 236(%esp), %edi +movl 316(%esp), %ebp +addl %esi, %ebp +movl %edx, 328(%esp) +movl 256(%esp), %edx +movl 464(%esp), %ecx +adcl %ecx, %edx +addl (%ebx,%edi), %ebp +movl %ebp, 316(%esp) +adcl 4(%ebx,%edi), %edx +movl 296(%esp), %edi +movl 400(%esp), %ebx +xorl %ebp, %edi +xorl %edx, %ebx +movl %edi, %ebp +movl %edx, 256(%esp) +movl %ebx, %edx +shll $16, 
%edx +shrl $16, %ebp +shrl $16, %ebx +orl %ebp, %edx +shll $16, %edi +orl %ebx, %edi +movl 340(%esp), %ebx +addl %edx, %ebx +movl %edx, 296(%esp) +movl 336(%esp), %edx +adcl %edi, %edx +xorl %ebx, %esi +xorl %edx, %ecx +movl %edx, 336(%esp) +movl %ecx, %edx +movl %edi, 300(%esp) +movl %esi, %edi +shrl $31, %edx +addl %esi, %esi +shrl $31, %edi +addl %ecx, %ecx +movl %ebx, 340(%esp) +orl %esi, %edx +movzbl 11(%eax), %ebp +orl %ecx, %edi +movl 236(%esp), %ebx +movl %edx, 260(%esp) +movl 364(%esp), %ecx +movl 476(%esp), %edx +addl %edx, %ecx +movl %edi, 264(%esp) +movl 248(%esp), %edi +movl 404(%esp), %esi +adcl %esi, %edi +addl (%ebp,%ebx), %ecx +movl %ecx, 364(%esp) +adcl 4(%ebp,%ebx), %edi +movl 352(%esp), %ebp +movl 460(%esp), %ebx +xorl %ecx, %ebp +xorl %edi, %ebx +movl %ebp, %ecx +movl %edi, 248(%esp) +movl %ebx, %edi +shll $16, %edi +shrl $16, %ecx +shrl $16, %ebx +orl %ecx, %edi +shll $16, %ebp +orl %ebx, %ebp +movl 308(%esp), %ebx +addl %edi, %ebx +movl %edi, 352(%esp) +movl 304(%esp), %edi +adcl %ebp, %edi +xorl %ebx, %edx +xorl %edi, %esi +movl %edi, 304(%esp) +movl %esi, %edi +movl %ebp, 372(%esp) +movl %edx, %ebp +shrl $31, %edi +addl %edx, %edx +shrl $31, %ebp +addl %esi, %esi +movzbl 9(%eax), %ecx +orl %edx, %edi +movl 236(%esp), %edx +orl %esi, %ebp +movl %ebx, 308(%esp) +addl $16, %eax +movl %edi, 276(%esp) +movl 452(%esp), %ebx +movl 472(%esp), %edi +addl %edi, %ebx +movl %ebp, 280(%esp) +movl 456(%esp), %ebp +movl 444(%esp), %esi +adcl %esi, %ebp +addl (%ecx,%edx), %ebx +movl %ebx, 240(%esp) +adcl 4(%ecx,%edx), %ebp +movl 312(%esp), %edx +movl 420(%esp), %ecx +xorl %ebx, %edx +xorl %ebp, %ecx +movl %ebp, 244(%esp) +movl %ecx, %ebx +movl %edx, %ebp +shll $16, %ebx +shrl $16, %ebp +shrl $16, %ecx +orl %ebp, %ebx +shll $16, %edx +orl %ecx, %edx +movl 324(%esp), %ecx +addl %ebx, %ecx +movl %ebx, 312(%esp) +movl 320(%esp), %ebx +adcl %edx, %ebx +xorl %ecx, %edi +xorl %ebx, %esi +movl %edi, %ebp +movl %ecx, 324(%esp) +movl %esi, %ecx +shrl $31, %ecx +addl %edi, %edi +shrl $31, %ebp +addl %esi, %esi +orl %esi, %ebp +orl %edi, %ecx +movl %ebx, 320(%esp) +movl %ebp, 272(%esp) +movl %ecx, 268(%esp) +movl 332(%esp), %ebp +movl 316(%esp), %esi +movl 364(%esp), %ecx +movl 372(%esp), %ebx +cmpl 360(%esp), %eax +jb blake2b_blocks_x86_L26 +blake2b_blocks_x86_L27: +movl 328(%esp), %edi +xorl 240(%esp), %edi +movl %esi, 316(%esp) +movl 512(%esp), %esi +movl 180(%esp), %eax +movl %edx, 368(%esp) +xorl %edi, %eax +movl 344(%esp), %edx +movl %eax, 180(%esp) +movl %eax, (%esi) +movl 340(%esp), %eax +xorl %ecx, %eax +movl 336(%esp), %ecx +xorl 244(%esp), %edx +xorl 248(%esp), %ecx +movl 232(%esp), %edi +xorl %edx, %edi +movl 156(%esp), %edx +xorl %eax, %edx +movl 224(%esp), %eax +movl %edi, 232(%esp) +xorl %ecx, %eax +movl %edi, 4(%esi) +movl %ebp, 332(%esp) +movl %eax, 224(%esp) +movl %eax, 12(%esi) +movl 324(%esp), %edi +movl 320(%esp), %eax +xorl 332(%esp), %edi +xorl 252(%esp), %eax +movl %edx, 156(%esp) +movl %edx, 8(%esi) +movl 184(%esp), %edx +movl 216(%esp), %ecx +xorl %edi, %edx +movl %edx, 184(%esp) +xorl %eax, %ecx +movl %edx, 16(%esi) +movl 308(%esp), %eax +movl 304(%esp), %edx +xorl 316(%esp), %eax +xorl 256(%esp), %edx +movl 176(%esp), %edi +xorl 264(%esp), %ebx +xorl %eax, %edi +movl 208(%esp), %eax +xorl %edx, %eax +movl %eax, 208(%esp) +movl %eax, 28(%esi) +movl 352(%esp), %edx +movl 220(%esp), %eax +movl 356(%esp), %ebp +xorl %ebx, %eax +movl 348(%esp), %ebx +xorl 260(%esp), %edx +xorl 268(%esp), %ebx +xorl 272(%esp), %ebp +movl %ecx, 216(%esp) +movl %ecx, 20(%esi) +movl 
188(%esp), %ecx +movl %eax, 220(%esp) +xorl %edx, %ecx +movl %eax, 36(%esi) +movl 160(%esp), %eax +movl 212(%esp), %edx +xorl %ebx, %eax +xorl %ebp, %edx +movl 296(%esp), %ebp +movl %eax, 160(%esp) +movl %eax, 40(%esi) +movl %edi, 176(%esp) +movl %edi, 24(%esi) +movl 300(%esp), %eax +movl 312(%esp), %ebx +movl 368(%esp), %edi +xorl 276(%esp), %ebp +xorl 280(%esp), %eax +xorl 284(%esp), %ebx +xorl 288(%esp), %edi +movl %edx, 212(%esp) +movl %edx, 44(%esi) +movl 168(%esp), %edx +movl %ecx, 188(%esp) +xorl %ebp, %edx +movl %ecx, 32(%esi) +movl %edx, 168(%esp) +movl 204(%esp), %ecx +movl %edx, 48(%esi) +xorl %eax, %ecx +movl 172(%esp), %eax +movl 196(%esp), %edx +xorl %ebx, %eax +xorl %edi, %edx +movl %ecx, 204(%esp) +movl %ecx, 52(%esi) +movl %eax, 172(%esp) +movl %edx, 196(%esp) +movl %eax, 56(%esi) +movl %edx, 60(%esi) +cmpl $128, 140(%esp) +ja blake2b_blocks_x86_L28 +blake2b_blocks_x86_L29: +addl $492, %esp +popl %ebp +popl %ebx +popl %edi +popl %esi +ret +FN_END blake2b_blocks_x86 diff --git a/src/libcryptobox/blake2/x86-64.S b/src/libcryptobox/blake2/x86-64.S new file mode 100644 index 000000000..f0de795fb --- /dev/null +++ b/src/libcryptobox/blake2/x86-64.S @@ -0,0 +1,1754 @@ +#include "../macro.S" +#include "constants.S" + +SECTION_TEXT + +GLOBAL_HIDDEN_FN_EXT blake2b_blocks_x86, 4, 8 +pushq %rbx +pushq %rbp +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +movq %rsp, %r9 +subq $320, %rsp +andq $~63, %rsp +cmpq $128, %rdx +movq %rdx, %rax +jb blake2b_blocks_x86_usebytesinc +movq $128, %rax +blake2b_blocks_x86_usebytesinc: +movq %rdx, 136(%rsp) +movq %rcx, 144(%rsp) +movq %rax, 152(%rsp) +movq %rdi, 160(%rsp) +movq %r9, 168(%rsp) +movq 80(%rdi), %rcx +andq %rcx, %rcx +jz blake2b_blocks_x86_not_final_call +cmpq $128, %rdx +je blake2b_blocks_x86_not_final_call +leaq 0(%rsp), %rcx +pxor %xmm0, %xmm0 +movdqa %xmm0, 0(%rcx) +movdqa %xmm0, 16(%rcx) +movdqa %xmm0, 32(%rcx) +movdqa %xmm0, 48(%rcx) +movdqa %xmm0, 64(%rcx) +movdqa %xmm0, 80(%rcx) +movdqa %xmm0, 96(%rcx) +movdqa %xmm0, 112(%rcx) +testq $0x40, %rdx +jz blake2b_blocks_x86_skip64 +movdqu 0(%rsi), %xmm0 +movdqu 16(%rsi), %xmm1 +movdqu 32(%rsi), %xmm2 +movdqu 48(%rsi), %xmm3 +movdqa %xmm0, 0(%rcx) +movdqa %xmm1, 16(%rcx) +movdqa %xmm2, 32(%rcx) +movdqa %xmm3, 48(%rcx) +addq $64, %rsi +addq $64, %rcx +blake2b_blocks_x86_skip64: +testq $0x20, %rdx +jz blake2b_blocks_x86_skip32 +movdqu 0(%rsi), %xmm0 +movdqu 16(%rsi), %xmm1 +movdqa %xmm0, 0(%rcx) +movdqa %xmm1, 16(%rcx) +addq $32, %rsi +addq $32, %rcx +blake2b_blocks_x86_skip32: +testq $0x10, %rdx +jz blake2b_blocks_x86_skip16 +movdqu 0(%rsi), %xmm0 +movdqa %xmm0, 0(%rcx) +addq $16, %rsi +addq $16, %rcx +blake2b_blocks_x86_skip16: +testq $0x8, %rdx +jz blake2b_blocks_x86_skip8 +movq 0(%rsi), %rax +movq %rax, 0(%rcx) +addq $8, %rsi +addq $8, %rcx +blake2b_blocks_x86_skip8: +testq $0x4, %rdx +jz blake2b_blocks_x86_skip4 +movl 0(%rsi), %eax +movl %eax, 0(%rcx) +addq $4, %rsi +addq $4, %rcx +blake2b_blocks_x86_skip4: +testq $0x2, %rdx +jz blake2b_blocks_x86_skip2 +movw 0(%rsi), %ax +movw %ax, 0(%rcx) +addq $2, %rsi +addq $2, %rcx +blake2b_blocks_x86_skip2: +testq $0x1, %rdx +jz blake2b_blocks_x86_skip1 +movb 0(%rsi), %al +movb %al, 0(%rcx) +blake2b_blocks_x86_skip1: +leaq 0(%rsp), %rsi +blake2b_blocks_x86_not_final_call: +movq %rsi, 128(%rsp) +movq 64(%rdi), %r12 +movq 72(%rdi), %r13 +movq 80(%rdi), %r14 +movq 88(%rdi), %r15 +movabsq $0x1f83d9abfb41bd6b, %rax +movabsq $0x5be0cd19137e2179, %rbx +xorq %rax, %r14 +xorq %rbx, %r15 +movq %r12, 256(%rsp) +movq %r13, 264(%rsp) +movq %r14, 272(%rsp) 
+movq %r15, 280(%rsp) +movq 0(%rdi), %rax +movq 8(%rdi), %rdx +movq 16(%rdi), %r8 +movq 24(%rdi), %r12 +movq 32(%rdi), %rbx +movq 40(%rdi), %rsi +movq 48(%rdi), %r9 +movq 56(%rdi), %r13 +.p2align 6,,63 +blake2b_blocks_x86_mainloop: +movq 128(%rsp), %r10 +cmpq %r10, %rsp +je blake2b_blocks_x86_nocopy +movdqu 0(%r10), %xmm0 +movdqu 16(%r10), %xmm1 +movdqu 32(%r10), %xmm2 +movdqu 48(%r10), %xmm3 +movdqu 64(%r10), %xmm4 +movdqu 80(%r10), %xmm5 +movdqu 96(%r10), %xmm6 +movdqu 112(%r10), %xmm7 +movdqa %xmm0, 0(%rsp) +movdqa %xmm1, 16(%rsp) +movdqa %xmm2, 32(%rsp) +movdqa %xmm3, 48(%rsp) +movdqa %xmm4, 64(%rsp) +movdqa %xmm5, 80(%rsp) +movdqa %xmm6, 96(%rsp) +movdqa %xmm7, 112(%rsp) +blake2b_blocks_x86_nocopy: +movq 152(%rsp), %r10 +movq 256(%rsp), %rcx +movq 264(%rsp), %rbp +movabsq $0x510e527fade682d1, %r11 +movabsq $0x9b05688c2b3e6c1f, %r15 +addq %r10, %rcx +cmpq %r10, %rcx +jae blake2b_blocks_x86_nocountercarry +addq $1, %rbp +blake2b_blocks_x86_nocountercarry: +movq %rcx, 256(%rsp) +movq %rbp, 264(%rsp) +xorq %r11, %rcx +xorq %r15, %rbp +movabsq $0x6a09e667f3bcc908, %r11 +movabsq $0xbb67ae8584caa73b, %rdi +movabsq $0x3c6ef372fe94f82b, %r10 +movabsq $0xa54ff53a5f1d36f1, %r14 +movq %r11, 296(%rsp) +movq 272(%rsp), %r11 +movq 280(%rsp), %r15 +movq %rax, 192(%rsp) +movq %rdx, 200(%rsp) +movq %r8, 208(%rsp) +movq %r12, 216(%rsp) +movq %rbx, 224(%rsp) +movq %rsi, 232(%rsp) +movq %r9, 240(%rsp) +movq %r13, 248(%rsp) +addq 0(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 16(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 32(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $32, %r11 +addq 48(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $40, %r13 +addq 8(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 24(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 40(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 56(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 64(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $32, %r15 +addq 80(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 96(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 112(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 72(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 88(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, %rcx +addq 104(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 120(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq 
%rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $1, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $1, %rbx +addq 112(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 32(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 72(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $32, %r11 +addq 104(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $40, %r13 +addq 80(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 64(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 120(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 48(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 8(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $32, %r15 +addq 0(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 88(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 40(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 96(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 16(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, %rcx +addq 56(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 24(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $1, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $1, %rbx +addq 88(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 96(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 40(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $32, %r11 +addq 120(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $40, %r13 +addq 64(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 0(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 16(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 104(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 80(%rsp), %rax +addq %rsi, 
%rax +xorq %rax, %r15 +rolq $32, %r15 +addq 24(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 56(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 72(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 112(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 48(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, %rcx +addq 8(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 32(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $1, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $1, %rbx +addq 56(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 24(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 104(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $32, %r11 +addq 88(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $40, %r13 +addq 72(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 8(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 96(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 112(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 16(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $32, %r15 +addq 40(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 32(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 120(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 48(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 80(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, %rcx +addq 0(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 64(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $1, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $1, %rbx +addq 72(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 40(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 16(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) 
+xorq %r8, %r11 +rolq $32, %r11 +addq 80(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $40, %r13 +addq 0(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 56(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 32(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 120(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 112(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $32, %r15 +addq 88(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 48(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 24(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 8(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 96(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, %rcx +addq 64(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 104(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $1, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $1, %rbx +addq 16(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 48(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 0(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $32, %r11 +addq 64(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $40, %r13 +addq 96(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 80(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 88(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 24(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 32(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $32, %r15 +addq 56(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 120(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 8(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 
+xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 104(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 40(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, %rcx +addq 112(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 72(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $1, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $1, %rbx +addq 96(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 8(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 112(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $32, %r11 +addq 32(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $40, %r13 +addq 40(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 120(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 104(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 80(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 0(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $32, %r15 +addq 48(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 72(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 64(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 56(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 24(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, %rcx +addq 16(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 88(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $1, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $1, %rbx +addq 104(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 56(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 96(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $32, %r11 +addq 24(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 
+xorq %r14, %r13 +rolq $40, %r13 +addq 88(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 112(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 8(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 72(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 40(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $32, %r15 +addq 120(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 64(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 16(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 0(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 32(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, %rcx +addq 48(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 80(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $1, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $1, %rbx +addq 48(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 112(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 88(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $32, %r11 +addq 0(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $40, %r13 +addq 120(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 72(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 24(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 64(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 96(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $32, %r15 +addq 104(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 8(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 80(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 16(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 56(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, 
%rcx +addq 32(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 40(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $1, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $1, %rbx +addq 80(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 64(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 56(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $32, %r11 +addq 8(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $40, %r13 +addq 16(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 32(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 48(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 40(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 120(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $32, %r15 +addq 72(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 24(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 104(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 88(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 112(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, %rcx +addq 96(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 0(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $1, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $1, %rbx +addq 0(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 16(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 32(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $32, %r11 +addq 48(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $40, %r13 +addq 8(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 24(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 40(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 56(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 
296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 64(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $32, %r15 +addq 80(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 96(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 112(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 72(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 88(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, %rcx +addq 104(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 120(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $1, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $1, %rbx +addq 112(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $32, %rcx +addq 32(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $32, %rbp +addq 72(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $32, %r11 +addq 104(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $32, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $40, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $40, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $40, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $40, %r13 +addq 80(%rsp), %rax +addq %rbx, %rax +xorq %rax, %rcx +rolq $48, %rcx +addq 64(%rsp), %rdx +addq %rsi, %rdx +xorq %rdx, %rbp +rolq $48, %rbp +addq 120(%rsp), %r8 +addq %r9, %r8 +movq %rax, 288(%rsp) +xorq %r8, %r11 +rolq $48, %r11 +addq 48(%rsp), %r12 +addq %r13, %r12 +xorq %r12, %r15 +rolq $48, %r15 +movq 296(%rsp), %rax +addq %rcx, %rax +xorq %rax, %rbx +movq %rax, 296(%rsp) +rolq $1, %rbx +addq %rbp, %rdi +xorq %rdi, %rsi +rolq $1, %rsi +addq %r11, %r10 +xorq %r10, %r9 +rolq $1, %r9 +movq 288(%rsp), %rax +addq %r15, %r14 +xorq %r14, %r13 +rolq $1, %r13 +addq 8(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $32, %r15 +addq 0(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $32, %rcx +addq 88(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $32, %rbp +addq 40(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $32, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $40, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $40, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 296(%rsp) +rolq $40, %r13 +movq 288(%rsp), %rax +addq %r11, %rdi +xorq %rdi, %rbx +rolq $40, %rbx +addq 96(%rsp), %rax +addq %rsi, %rax +xorq %rax, %r15 +rolq $48, %r15 +addq 16(%rsp), %rdx +addq %r9, %rdx +xorq %rdx, %rcx +rolq $48, %rcx +addq 56(%rsp), %r8 +addq %r13, %r8 +movq %rax, 288(%rsp) +xorq %r8, %rbp +rolq $48, %rbp +addq 24(%rsp), %r12 +addq %rbx, %r12 +xorq %r12, %r11 +rolq $48, %r11 +addq %r15, %r10 +xorq %r10, %rsi +rolq $1, %rsi +addq %rcx, %r14 +xorq %r14, %r9 +rolq $1, %r9 +movq 296(%rsp), %rax +addq %rbp, %rax +xorq %rax, %r13 +movq %rax, 
296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+xorq 296(%rsp), %rax
+xorq %rdi, %rdx
+xorq %r10, %r8
+xorq %r14, %r12
+xorq %rcx, %rbx
+xorq %rbp, %rsi
+xorq %r11, %r9
+xorq %r15, %r13
+xorq 192(%rsp), %rax
+xorq 200(%rsp), %rdx
+xorq 208(%rsp), %r8
+xorq 216(%rsp), %r12
+xorq 224(%rsp), %rbx
+xorq 232(%rsp), %rsi
+xorq 240(%rsp), %r9
+xorq 248(%rsp), %r13
+movq 128(%rsp), %rcx
+movq 136(%rsp), %rbp
+movq 144(%rsp), %r11
+cmpq $128, %rbp
+jbe blake2b_blocks_x86_done
+addq %r11, %rcx
+subq $128, %rbp
+movq %rcx, 128(%rsp)
+movq %rbp, 136(%rsp)
+jmp blake2b_blocks_x86_mainloop
+blake2b_blocks_x86_done:
+movq 160(%rsp), %rcx
+movq 256(%rsp), %rbp
+movq 264(%rsp), %r11
+movq %rax, 0(%rcx)
+movq %rdx, 8(%rcx)
+movq %r8, 16(%rcx)
+movq %r12, 24(%rcx)
+movq %rbx, 32(%rcx)
+movq %rsi, 40(%rcx)
+movq %r9, 48(%rcx)
+movq %r13, 56(%rcx)
+movq %rbp, 64(%rcx)
+movq %r11, 72(%rcx)
+movq 168(%rsp), %rsp
+popq %r15
+popq %r14
+popq %r13
+popq %r12
+popq %rbp
+popq %rbx
+ret
+FN_END blake2b_blocks_x86
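The long run of addq/xorq/rolq instructions above is the fully unrolled scalar BLAKE2b round function: each group of operations is one application of the G mixing step, with rolq $32, $40, $48 and $1 standing in for BLAKE2b's right rotations by 32, 24, 16 and 63, and the final block of xorq instructions performing the feed-forward of the working vector into the saved chaining value before the block counter check and the jump back to blake2b_blocks_x86_mainloop. As a reading aid only, here is a minimal C sketch of that G step; the helper names (rotl64, blake2b_g) and the pointer-based signature are illustrative assumptions and are not part of this commit:

#include <stdint.h>

/* Rotate a 64-bit word left by n bits (0 < n < 64). */
static inline uint64_t rotl64(uint64_t x, unsigned n)
{
	return (x << n) | (x >> (64 - n));
}

/* One BLAKE2b G step on state words a, b, c, d with message words mx, my.
 * The rotation amounts mirror the rolq $32/$40/$48/$1 pattern in the
 * assembly (left rotations equivalent to right rotations by 32, 24, 16, 63). */
static inline void blake2b_g(uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
			     uint64_t mx, uint64_t my)
{
	*a += *b + mx;
	*d = rotl64(*d ^ *a, 32);
	*c += *d;
	*b = rotl64(*b ^ *c, 40);
	*a += *b + my;
	*d = rotl64(*d ^ *a, 48);
	*c += *d;
	*b = rotl64(*b ^ *c, 1);
}

A full round applies this step eight times (four column steps, then four diagonal steps), selecting message words through the BLAKE2b sigma schedule, which is why the unrolled assembly cycles through the 0(%rsp)…120(%rsp) message slots in a different order for each round.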