about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/libcryptobox/CMakeLists.txt 3
-rw-r--r-- src/libcryptobox/blake2/avx.S 689
-rw-r--r-- src/libcryptobox/blake2/blake2.c 13
-rw-r--r-- src/libcryptobox/blake2/constants.S 30
-rw-r--r-- src/libcryptobox/blake2/ref.c 2
-rw-r--r-- src/libcryptobox/blake2/x86-32.S 1080
-rw-r--r-- src/libcryptobox/blake2/x86-64.S 1754
7 files changed, 3558 insertions, 13 deletions
diff --git a/src/libcryptobox/CMakeLists.txt b/src/libcryptobox/CMakeLists.txt
index 9cb7ecbe7..171a4d0f9 100644
--- a/src/libcryptobox/CMakeLists.txt
+++ b/src/libcryptobox/CMakeLists.txt
@@ -53,9 +53,11 @@ IF("${ARCH}" STREQUAL "x86_64")
SET(CURVESRC ${CURVESRC} ${CMAKE_CURRENT_SOURCE_DIR}/curve25519/curve25519-donna.c)
endif()
+ SET(BLAKE2SRC ${BLAKE2SRC} ${CMAKE_CURRENT_SOURCE_DIR}/blake2/x86-64.S) # x86_64 branch: append the 64-bit BLAKE2 assembly source
ELSEIF("${ARCH}" STREQUAL "i386")
SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/ref-32.c)
SET(CURVESRC ${CURVESRC} ${CMAKE_CURRENT_SOURCE_DIR}/curve25519/curve25519-donna.c)
+ SET(BLAKE2SRC ${BLAKE2SRC} ${CMAKE_CURRENT_SOURCE_DIR}/blake2/x86-32.S) # i386 branch: append the 32-bit BLAKE2 assembly source
ELSE() # any other ARCH: no BLAKE2 assembly source is appended in this branch
SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/ref-32.c)
ENDIF()
@@ -69,6 +71,7 @@ IF(HAVE_AVX)
SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/avx.S)
SET(CURVESRC ${CURVESRC} ${CMAKE_CURRENT_SOURCE_DIR}/curve25519/avx.S
${CMAKE_CURRENT_SOURCE_DIR}/curve25519/avx.c)
+ SET(BLAKE2SRC ${BLAKE2SRC} ${CMAKE_CURRENT_SOURCE_DIR}/blake2/avx.S) # NOTE(review): blake2/avx.S uses 64-bit registers (pushq, %rsp); confirm HAVE_AVX is never set for i386 builds
ENDIF(HAVE_AVX)
IF(HAVE_SSE2)
SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/sse2.S)
diff --git a/src/libcryptobox/blake2/avx.S b/src/libcryptobox/blake2/avx.S
new file mode 100644
index 000000000..e569f0ba7
--- /dev/null
+++ b/src/libcryptobox/blake2/avx.S
@@ -0,0 +1,689 @@
+#include "../macro.S"
+#include "constants.S"
+
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN_EXT blake2b_blocks_avx, 4, 16 /* BLAKE2b block compression with 128-bit AVX; as used below: %rdi = state, %rsi = input, %rdx = byte count, %rcx = value added to the input pointer after each block */
+pushq %rbp
+movq %rsp, %rbp /* keep the incoming stack pointer so the epilogue can undo the alignment */
+andq $-64, %rsp /* align the stack down to a 64-byte boundary */
+pushq %r12
+pushq %r13
+pushq %r14
+pushq %r15
+pushq %rbx
+subq $344, %rsp /* 344 + 5*8 bytes of pushes = 384, so %rsp is 64-byte aligned again */
+LOAD_VAR_PIC 48+blake2b_constants, %r9 /* address of constant words 6 and 7 */
+LOAD_VAR_PIC blake2b_constants_ssse3, %rax
+leaq 16(%rax), %r8
+vmovdqu 80(%rdi), %xmm0 /* 16 bytes at state+80; presumably the finalization flags f[0..1] - confirm against the state struct */
+cmpq $128, %rdx /* flags are consumed by the cmovbe below; nothing in between modifies them */
+vpxor (%r9), %xmm0, %xmm0 /* xor the state+80 words with constant words 6 and 7 */
+movl $128, %r9d
+vmovdqu (%rax), %xmm12 /* xmm12 = vpshufb mask, first row of blake2b_constants_ssse3 (rotate right by 16) */
+cmovbe %rdx, %r9 /* r9 = min(byte count, 128); later added to the counter once per block */
+vmovdqu (%r8), %xmm1 /* xmm1 = vpshufb mask, second row of blake2b_constants_ssse3 (rotate right by 24) */
+movq 64(%rdi), %r8 /* r8 = 64-bit word at state+64, used as the low half of the byte counter */
+movq 72(%rdi), %rax /* rax = 64-bit word at state+72, used as the high half of the byte counter */
+cmpq $0, 80(%rdi)
+je blake2b_blocks_avx_L21 /* word at state+80 is zero: process the input in place, skip the padding path */
+blake2b_blocks_avx_L2: /* reached only when the word at state+80 is non-zero */
+cmpq $128, %rdx
+je blake2b_blocks_avx_L21 /* exactly 128 bytes: no padding needed, use the input in place */
+blake2b_blocks_avx_L3: /* copy the input into a zero-padded 128-byte buffer at (%rsp), one length bit at a time */
+lea (%rsp), %r10 /* r10 = write cursor inside the stack buffer */
+testq $64, %rdx
+je blake2b_blocks_avx_L5
+blake2b_blocks_avx_L4: /* length bit 64 set: copy 64 bytes into the first half of the buffer */
+vmovdqu (%rsi), %xmm2
+vmovdqu %xmm2, (%rsp)
+lea 64(%rsp), %r10 /* cursor moves to the second half */
+vmovdqu 16(%rsi), %xmm3
+vmovdqu %xmm3, 16(%rsp)
+vpxor %xmm2, %xmm2, %xmm2 /* xmm2 = 0, used for the zero fill at blake2b_blocks_avx_L6 */
+vmovdqu 32(%rsi), %xmm4
+vmovdqu %xmm4, 32(%rsp)
+vmovdqu 48(%rsi), %xmm5
+vmovdqu %xmm5, 48(%rsp)
+addq $64, %rsi
+jmp blake2b_blocks_avx_L6
+blake2b_blocks_avx_L5: /* length bit 64 clear: zero the second half; the cursor stays at the buffer start */
+vpxor %xmm2, %xmm2, %xmm2
+vmovdqu %xmm2, 64(%rsp)
+vmovdqu %xmm2, 80(%rsp)
+vmovdqu %xmm2, 96(%rsp)
+vmovdqu %xmm2, 112(%rsp)
+blake2b_blocks_avx_L6: /* zero the 64 bytes at the cursor before the smaller copies below */
+vmovdqu %xmm2, (%r10)
+vmovdqu %xmm2, 16(%r10)
+vmovdqu %xmm2, 32(%r10)
+vmovdqu %xmm2, 48(%r10)
+testq $32, %rdx
+je blake2b_blocks_avx_L8
+blake2b_blocks_avx_L7: /* length bit 32 set: copy 32 bytes */
+vmovdqu (%rsi), %xmm2
+vmovdqu %xmm2, (%r10)
+vmovdqu 16(%rsi), %xmm3
+vmovdqu %xmm3, 16(%r10)
+addq $32, %rsi
+addq $32, %r10
+blake2b_blocks_avx_L8:
+testq $16, %rdx
+je blake2b_blocks_avx_L10
+blake2b_blocks_avx_L9: /* length bit 16 set: copy 16 bytes */
+vmovdqu (%rsi), %xmm2
+vmovdqu %xmm2, (%r10)
+addq $16, %rsi
+addq $16, %r10
+blake2b_blocks_avx_L10:
+testq $8, %rdx
+je blake2b_blocks_avx_L12
+blake2b_blocks_avx_L11: /* length bit 8 set: copy 8 bytes */
+movq (%rsi), %r11
+addq $8, %rsi
+movq %r11, (%r10)
+addq $8, %r10
+blake2b_blocks_avx_L12:
+testq $4, %rdx
+je blake2b_blocks_avx_L14
+blake2b_blocks_avx_L13: /* length bit 4 set: copy 4 bytes */
+movl (%rsi), %r11d
+addq $4, %rsi
+movl %r11d, (%r10)
+addq $4, %r10
+blake2b_blocks_avx_L14:
+testq $2, %rdx
+je blake2b_blocks_avx_L16
+blake2b_blocks_avx_L15: /* length bit 2 set: copy 2 bytes */
+movzwl (%rsi), %r11d
+addq $2, %rsi
+movw %r11w, (%r10)
+addq $2, %r10
+blake2b_blocks_avx_L16:
+testq $1, %rdx
+je blake2b_blocks_avx_L18
+blake2b_blocks_avx_L17: /* length bit 1 set: copy the last byte */
+movb (%rsi), %sil /* overwrites the low byte of %rsi; harmless because %rsi is replaced right below */
+movb %sil, (%r10)
+blake2b_blocks_avx_L18:
+lea (%rsp), %rsi /* from here on the input pointer is the padded stack buffer */
+blake2b_blocks_avx_L21: /* spill the chaining value, constants and arguments to fixed stack slots used by the block loop */
+LOAD_VAR_PIC 32+blake2b_constants, %r10
+LOAD_VAR_PIC blake2b_constants, %r11
+vmovdqu (%rdi), %xmm5 /* state bytes 0..15 */
+vmovdqu 16(%rdi), %xmm6 /* state bytes 16..31 */
+vmovdqu 32(%rdi), %xmm7 /* state bytes 32..47 */
+vmovdqu (%r10), %xmm4 /* constant words 4 and 5 */
+LOAD_VAR_PIC 16+blake2b_constants, %r10
+vmovdqu 48(%rdi), %xmm8 /* state bytes 48..63 */
+vmovdqu (%r11), %xmm3 /* constant words 0 and 1 */
+vmovdqu %xmm3, 176(%rsp)
+vmovdqu (%r10), %xmm2 /* constant words 2 and 3 */
+vmovdqu %xmm2, 160(%rsp)
+vmovdqu %xmm4, 144(%rsp)
+vmovdqu %xmm8, 240(%rsp) /* note: state bytes 48..63 live at 240, bytes 32..47 at 256 */
+vmovdqu %xmm7, 256(%rsp)
+vmovdqu %xmm6, 224(%rsp)
+vmovdqu %xmm5, 208(%rsp)
+vmovdqu %xmm0, 192(%rsp) /* state+80 words already xored with constant words 6 and 7 */
+movq %r9, 272(%rsp) /* per-block counter increment, min(byte count, 128) */
+movq %rdi, 128(%rsp) /* state pointer, reloaded at blake2b_blocks_avx_L26 */
+movq %rcx, 136(%rsp) /* 4th argument, added to the input pointer at blake2b_blocks_avx_L25 */
+jmp blake2b_blocks_avx_L22 /* enter the loop past the pointer-advance code */
+# align to 31 mod 64
+.p2align 6 /* pad to a 64-byte boundary; the 31 one-byte nops below then put blake2b_blocks_avx_L25 at offset 31 */
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+nop
+blake2b_blocks_avx_L25: /* loop back-edge: step to the next block */
+addq 136(%rsp), %rsi /* advance the input pointer by the saved 4th argument */
+addq $-128, %rdx /* one 128-byte block consumed */
+blake2b_blocks_avx_L22: /* per-block entry: bump the counter, then run the first unrolled rounds */
+movq 272(%rsp), %rcx /* rcx = counter increment saved during setup */
+addq %rcx, %r8 /* counter low += increment */
+cmpq %rcx, %r8 /* sum below the increment means the addition wrapped */
+lea 1(%rax), %rbx /* rbx = counter high + 1; lea leaves the flags untouched */
+vmovdqu (%rsi), %xmm15
+vmovdqu 16(%rsi), %xmm5
+vmovdqu 32(%rsi), %xmm3
+vmovdqu 48(%rsi), %xmm6
+cmovb %rbx, %rax /* carry into counter high when the low half wrapped */
+vmovd %r8, %xmm7
+vpunpcklqdq %xmm5, %xmm15, %xmm2
+LOAD_VAR_PIC 96+blake2b_constants, %rcx /* rcx = first offset-table row for the loop at blake2b_blocks_avx_L23 (blake2b_sigma + 32, given the 64-byte blake2b_constants table) */
+vpunpcklqdq %xmm6, %xmm3, %xmm8
+LOAD_VAR_PIC 224+blake2b_constants, %rbx /* rbx = end pointer for that loop (blake2b_sigma + 160) */
+vpaddq 208(%rsp), %xmm2, %xmm0
+vpaddq 224(%rsp), %xmm8, %xmm10
+vmovd %rax, %xmm14
+vmovdqu 256(%rsp), %xmm4
+vmovdqu 240(%rsp), %xmm11
+vpunpcklqdq %xmm14, %xmm7, %xmm9 /* xmm9 = {counter low, counter high} */
+vpaddq %xmm4, %xmm0, %xmm13
+vpaddq %xmm11, %xmm10, %xmm2
+vpxor 144(%rsp), %xmm9, %xmm0 /* counter xor constant words 4 and 5 */
+vpxor 192(%rsp), %xmm2, %xmm10
+vpxor %xmm13, %xmm0, %xmm8
+vpshufd $177, %xmm8, %xmm8 /* $177 swaps the 32-bit halves of each 64-bit lane, i.e. rotate by 32 */
+vpshufd $177, %xmm10, %xmm7
+vpaddq 176(%rsp), %xmm8, %xmm14
+vpaddq 160(%rsp), %xmm7, %xmm9
+vpxor %xmm14, %xmm4, %xmm4
+vpxor %xmm9, %xmm11, %xmm11
+vpshufb %xmm1, %xmm4, %xmm4 /* byte shuffle with the rotate-right-by-24 mask */
+vpshufb %xmm1, %xmm11, %xmm0
+vpunpckhqdq %xmm5, %xmm15, %xmm15
+vpunpckhqdq %xmm6, %xmm3, %xmm6
+vpaddq %xmm15, %xmm13, %xmm13
+vpaddq %xmm6, %xmm2, %xmm6
+vpaddq %xmm4, %xmm13, %xmm10
+vpaddq %xmm0, %xmm6, %xmm15
+vpxor %xmm10, %xmm8, %xmm2
+vpxor %xmm15, %xmm7, %xmm8
+vpshufb %xmm12, %xmm2, %xmm5 /* byte shuffle with the rotate-right-by-16 mask */
+vpshufb %xmm12, %xmm8, %xmm2
+vpaddq %xmm5, %xmm14, %xmm6
+vpaddq %xmm2, %xmm9, %xmm7
+vpxor %xmm6, %xmm4, %xmm4
+vpxor %xmm7, %xmm0, %xmm9
+vpaddq %xmm4, %xmm4, %xmm14 /* x + x, i.e. x shifted left by 1 */
+vpaddq %xmm9, %xmm9, %xmm13
+vpsrlq $63, %xmm4, %xmm0 /* x >> 63; or-ed with x + x below this gives a rotate right by 63 */
+vpsrlq $63, %xmm9, %xmm11
+vpor %xmm14, %xmm0, %xmm8
+vpor %xmm13, %xmm11, %xmm4
+vpalignr $8, %xmm8, %xmm4, %xmm0 /* vpalignr $8 pairs build vectors from the high half of one register and the low half of the other */
+vpalignr $8, %xmm4, %xmm8, %xmm14
+vmovdqu 64(%rsi), %xmm9
+vmovdqu 80(%rsi), %xmm8
+vmovdqu 96(%rsi), %xmm4
+vpunpcklqdq %xmm8, %xmm9, %xmm11
+vpaddq %xmm11, %xmm10, %xmm10
+vmovdqu 112(%rsi), %xmm11
+vpaddq %xmm0, %xmm10, %xmm13
+vpunpcklqdq %xmm11, %xmm4, %xmm10
+vpaddq %xmm10, %xmm15, %xmm15
+vpaddq %xmm14, %xmm15, %xmm15
+vpalignr $8, %xmm2, %xmm5, %xmm10
+vpalignr $8, %xmm5, %xmm2, %xmm5
+vpxor %xmm13, %xmm10, %xmm10
+vpxor %xmm15, %xmm5, %xmm2
+vpshufd $177, %xmm10, %xmm10
+vpshufd $177, %xmm2, %xmm2
+vpaddq %xmm10, %xmm7, %xmm7
+vpaddq %xmm2, %xmm6, %xmm5
+vpxor %xmm7, %xmm0, %xmm6
+vpxor %xmm5, %xmm14, %xmm14
+vpshufb %xmm1, %xmm6, %xmm0
+vpshufb %xmm1, %xmm14, %xmm6
+vpunpckhqdq %xmm8, %xmm9, %xmm14
+vpaddq %xmm14, %xmm13, %xmm13
+vpaddq %xmm0, %xmm13, %xmm14
+vpunpckhqdq %xmm11, %xmm4, %xmm13
+vpxor %xmm14, %xmm10, %xmm10
+vpaddq %xmm13, %xmm15, %xmm15
+vpshufb %xmm12, %xmm10, %xmm13
+vpaddq %xmm6, %xmm15, %xmm15
+vpaddq %xmm13, %xmm7, %xmm10
+vpxor %xmm15, %xmm2, %xmm2
+vpxor %xmm10, %xmm0, %xmm0
+vpshufb %xmm12, %xmm2, %xmm2
+vpaddq %xmm2, %xmm5, %xmm5
+vpxor %xmm5, %xmm6, %xmm7
+vpsrlq $63, %xmm0, %xmm6
+vpaddq %xmm0, %xmm0, %xmm0
+vpor %xmm0, %xmm6, %xmm6
+vpsrlq $63, %xmm7, %xmm0
+vpaddq %xmm7, %xmm7, %xmm7
+vpor %xmm7, %xmm0, %xmm0
+vpalignr $8, %xmm0, %xmm6, %xmm7
+vpalignr $8, %xmm6, %xmm0, %xmm6
+vpunpcklqdq %xmm3, %xmm11, %xmm0 /* from here the message words are combined in a different order than in the section above */
+vpaddq %xmm0, %xmm14, %xmm14
+vpaddq %xmm7, %xmm14, %xmm0
+vpunpckhqdq %xmm4, %xmm9, %xmm14
+vpaddq %xmm14, %xmm15, %xmm15
+vpaddq %xmm6, %xmm15, %xmm14
+vpalignr $8, %xmm13, %xmm2, %xmm15
+vpxor %xmm0, %xmm15, %xmm15
+vpshufd $177, %xmm15, %xmm15
+vpalignr $8, %xmm2, %xmm13, %xmm2
+vpxor %xmm14, %xmm2, %xmm13
+vpaddq %xmm15, %xmm5, %xmm2
+vpshufd $177, %xmm13, %xmm13
+vpxor %xmm2, %xmm7, %xmm5
+vpunpcklqdq %xmm9, %xmm8, %xmm7
+vpaddq %xmm13, %xmm10, %xmm10
+vpaddq %xmm7, %xmm0, %xmm9
+vmovdqu 48(%rsi), %xmm0
+vpshufb %xmm1, %xmm5, %xmm5
+vpxor %xmm10, %xmm6, %xmm6
+vpshufb %xmm1, %xmm6, %xmm6
+vpaddq %xmm5, %xmm9, %xmm9
+vpalignr $8, %xmm11, %xmm0, %xmm11
+vpxor %xmm9, %xmm15, %xmm15
+vpaddq %xmm11, %xmm14, %xmm7
+vpshufb %xmm12, %xmm15, %xmm11
+vpaddq %xmm6, %xmm7, %xmm14
+vpaddq %xmm11, %xmm2, %xmm2
+vpxor %xmm14, %xmm13, %xmm13
+vpxor %xmm2, %xmm5, %xmm5
+vpshufb %xmm12, %xmm13, %xmm13
+vpaddq %xmm13, %xmm10, %xmm10
+vpxor %xmm10, %xmm6, %xmm15
+vpsrlq $63, %xmm5, %xmm6
+vpaddq %xmm5, %xmm5, %xmm5
+vpsrlq $63, %xmm15, %xmm7
+vpor %xmm5, %xmm6, %xmm6
+vpaddq %xmm15, %xmm15, %xmm15
+vpor %xmm15, %xmm7, %xmm5
+vpalignr $8, %xmm6, %xmm5, %xmm15
+vpalignr $8, %xmm5, %xmm6, %xmm5
+vpshufd $78, (%rsi), %xmm6 /* $78 swaps the two 64-bit halves of the first 16 message bytes */
+vpaddq %xmm6, %xmm9, %xmm9
+vpunpckhqdq %xmm3, %xmm8, %xmm3
+vpaddq %xmm3, %xmm14, %xmm6
+vpaddq %xmm15, %xmm9, %xmm9
+vpaddq %xmm5, %xmm6, %xmm8
+vpalignr $8, %xmm13, %xmm11, %xmm3
+vpalignr $8, %xmm11, %xmm13, %xmm11
+vpxor %xmm9, %xmm3, %xmm7
+vpshufd $177, %xmm7, %xmm14
+vpxor %xmm8, %xmm11, %xmm13
+vpshufd $177, %xmm13, %xmm3
+vpaddq %xmm14, %xmm10, %xmm6
+vpaddq %xmm3, %xmm2, %xmm10
+vpxor %xmm6, %xmm15, %xmm2
+vmovdqu 16(%rsi), %xmm15
+vpshufb %xmm1, %xmm2, %xmm7
+vpxor %xmm10, %xmm5, %xmm2
+vpshufb %xmm1, %xmm2, %xmm5
+vpunpcklqdq %xmm15, %xmm4, %xmm4
+vpunpckhqdq %xmm15, %xmm0, %xmm0
+vpaddq %xmm4, %xmm9, %xmm2
+vpaddq %xmm0, %xmm8, %xmm8
+vpaddq %xmm7, %xmm2, %xmm2
+vpaddq %xmm5, %xmm8, %xmm0
+vpxor %xmm2, %xmm14, %xmm15
+vpxor %xmm0, %xmm3, %xmm9
+vpshufb %xmm12, %xmm15, %xmm15
+vpshufb %xmm12, %xmm9, %xmm3
+vpaddq %xmm15, %xmm6, %xmm8
+vpaddq %xmm3, %xmm10, %xmm6
+vpxor %xmm8, %xmm7, %xmm10
+vpxor %xmm6, %xmm5, %xmm5
+vpaddq %xmm5, %xmm5, %xmm9
+vpsrlq $63, %xmm10, %xmm4
+vpsrlq $63, %xmm5, %xmm7
+vpaddq %xmm10, %xmm10, %xmm10
+vpor %xmm10, %xmm4, %xmm13
+vpor %xmm9, %xmm7, %xmm11
+vpalignr $8, %xmm11, %xmm13, %xmm4
+vpalignr $8, %xmm13, %xmm11, %xmm7
+vpalignr $8, %xmm15, %xmm3, %xmm9
+vpalignr $8, %xmm3, %xmm15, %xmm10
+blake2b_blocks_avx_L23: /* table-driven loop: each iteration consumes one 16-byte row of message byte offsets at (%rcx) */
+movzbl (%rcx), %edi /* table bytes are byte offsets into the message at %rsi; %rdi is free because the state pointer is saved at 128(%rsp) */
+movzbl 2(%rcx), %r9d
+movzbl 4(%rcx), %r10d
+movzbl 6(%rcx), %r11d
+vmovq (%rdi,%rsi), %xmm5 /* gather two message words per vector: vmovq fills the low lane, vpinsrq the high lane */
+vpinsrq $1, (%r9,%rsi), %xmm5, %xmm14
+vmovq (%r10,%rsi), %xmm3
+vpinsrq $1, (%r11,%rsi), %xmm3, %xmm15
+vpaddq %xmm14, %xmm2, %xmm2
+vpaddq %xmm15, %xmm0, %xmm0
+vpaddq %xmm4, %xmm2, %xmm2
+vpaddq %xmm7, %xmm0, %xmm0
+vpxor %xmm2, %xmm9, %xmm11
+vpxor %xmm0, %xmm10, %xmm10
+vpshufd $177, %xmm11, %xmm3 /* rotate each 64-bit lane by 32 */
+movzbl 1(%rcx), %r12d
+movzbl 5(%rcx), %r14d
+vpshufd $177, %xmm10, %xmm5
+vpaddq %xmm3, %xmm6, %xmm6
+vpaddq %xmm5, %xmm8, %xmm9
+movzbl 3(%rcx), %r13d
+vpxor %xmm6, %xmm4, %xmm14
+movzbl 7(%rcx), %r15d
+vpxor %xmm9, %xmm7, %xmm15
+vmovq (%r12,%rsi), %xmm4
+vmovq (%r14,%rsi), %xmm11
+vpinsrq $1, (%r13,%rsi), %xmm4, %xmm7
+vpinsrq $1, (%r15,%rsi), %xmm11, %xmm13
+vpshufb %xmm1, %xmm14, %xmm8 /* rotate right by 24 via the xmm1 mask */
+vpshufb %xmm1, %xmm15, %xmm14
+vpaddq %xmm7, %xmm2, %xmm2
+vpaddq %xmm13, %xmm0, %xmm0
+vpaddq %xmm8, %xmm2, %xmm4
+vpaddq %xmm14, %xmm0, %xmm7
+vpxor %xmm4, %xmm3, %xmm10
+vpxor %xmm7, %xmm5, %xmm3
+vpshufb %xmm12, %xmm10, %xmm11 /* rotate right by 16 via the xmm12 mask */
+vpshufb %xmm12, %xmm3, %xmm10
+vpaddq %xmm11, %xmm6, %xmm13
+vpaddq %xmm10, %xmm9, %xmm9
+movzbl 8(%rcx), %edi /* second half of the row: table bytes 8..15 */
+vpxor %xmm13, %xmm8, %xmm8
+movzbl 12(%rcx), %r10d
+vpxor %xmm9, %xmm14, %xmm2
+movzbl 10(%rcx), %r9d
+vpsrlq $63, %xmm8, %xmm6 /* with the vpaddq x,x and vpor below: rotate right by 63 */
+movzbl 14(%rcx), %r11d
+vpsrlq $63, %xmm2, %xmm0
+vpaddq %xmm8, %xmm8, %xmm5
+vpaddq %xmm2, %xmm2, %xmm14
+vmovq (%rdi,%rsi), %xmm15
+vpor %xmm5, %xmm6, %xmm8
+vmovq (%r10,%rsi), %xmm3
+vpor %xmm14, %xmm0, %xmm6
+vpinsrq $1, (%r9,%rsi), %xmm15, %xmm5
+vpinsrq $1, (%r11,%rsi), %xmm3, %xmm0
+vpalignr $8, %xmm8, %xmm6, %xmm2
+vpalignr $8, %xmm6, %xmm8, %xmm14
+vpalignr $8, %xmm10, %xmm11, %xmm8
+vpalignr $8, %xmm11, %xmm10, %xmm11
+vpaddq %xmm5, %xmm4, %xmm4
+vpaddq %xmm0, %xmm7, %xmm7
+vpaddq %xmm2, %xmm4, %xmm15
+vpaddq %xmm14, %xmm7, %xmm0
+vpxor %xmm15, %xmm8, %xmm6
+vpxor %xmm0, %xmm11, %xmm10
+vpshufd $177, %xmm6, %xmm6
+vpshufd $177, %xmm10, %xmm8
+movzbl 9(%rcx), %r12d
+movzbl 13(%rcx), %r14d
+vpaddq %xmm6, %xmm9, %xmm4
+vpaddq %xmm8, %xmm13, %xmm7
+movzbl 11(%rcx), %r13d
+vpxor %xmm4, %xmm2, %xmm9
+movzbl 15(%rcx), %r15d
+vpxor %xmm7, %xmm14, %xmm2
+vmovq (%r12,%rsi), %xmm14
+addq $16, %rcx /* advance to the next table row; all 16 offsets of this row are already in registers */
+vmovq (%r14,%rsi), %xmm3
+vpshufb %xmm1, %xmm9, %xmm13
+vpinsrq $1, (%r13,%rsi), %xmm14, %xmm5
+vpinsrq $1, (%r15,%rsi), %xmm3, %xmm9
+vpshufb %xmm1, %xmm2, %xmm11
+vpaddq %xmm5, %xmm15, %xmm15
+vpaddq %xmm9, %xmm0, %xmm0
+vpaddq %xmm13, %xmm15, %xmm2
+vpaddq %xmm11, %xmm0, %xmm0
+vpxor %xmm2, %xmm6, %xmm6
+vpxor %xmm0, %xmm8, %xmm8
+vpshufb %xmm12, %xmm6, %xmm14
+vpshufb %xmm12, %xmm8, %xmm15
+vpaddq %xmm14, %xmm4, %xmm8
+vpaddq %xmm15, %xmm7, %xmm6
+vpxor %xmm8, %xmm13, %xmm4
+vpxor %xmm6, %xmm11, %xmm11
+vpaddq %xmm4, %xmm4, %xmm10
+vpsrlq $63, %xmm4, %xmm7
+vpsrlq $63, %xmm11, %xmm13
+vpaddq %xmm11, %xmm11, %xmm4
+vpor %xmm10, %xmm7, %xmm3
+vpor %xmm4, %xmm13, %xmm11
+vpalignr $8, %xmm11, %xmm3, %xmm4
+vpalignr $8, %xmm3, %xmm11, %xmm7
+vpalignr $8, %xmm15, %xmm14, %xmm10
+vpalignr $8, %xmm14, %xmm15, %xmm9
+cmpq %rbx, %rcx
+jb blake2b_blocks_avx_L23 /* repeat until %rcx reaches the end pointer in %rbx (224+blake2b_constants) */
+blake2b_blocks_avx_L24: /* rounds after the table loop, using fixed message offsets held in scalar registers, then the state update */
+movq 32(%rsi), %r13
+movq (%rsi), %r10
+movq 48(%rsi), %r9
+vmovd %r13, %xmm13
+vpinsrq $1, %r9, %xmm13, %xmm14
+vmovd %r10, %xmm3
+movq 16(%rsi), %rbx
+vpinsrq $1, %rbx, %xmm3, %xmm15
+vpaddq %xmm14, %xmm0, %xmm0
+vpaddq %xmm7, %xmm0, %xmm3
+vpxor %xmm3, %xmm10, %xmm10
+vpaddq %xmm15, %xmm2, %xmm2
+vpaddq %xmm4, %xmm2, %xmm5
+vpshufd $177, %xmm10, %xmm15
+vpxor %xmm5, %xmm9, %xmm9
+vpshufd $177, %xmm9, %xmm9
+vpaddq %xmm15, %xmm8, %xmm14
+vpaddq %xmm9, %xmm6, %xmm0
+vpxor %xmm14, %xmm7, %xmm7
+vpxor %xmm0, %xmm4, %xmm8
+vpshufb %xmm1, %xmm7, %xmm4
+vpshufb %xmm1, %xmm8, %xmm2
+vmovq 8(%rsi), %xmm7
+movq %r8, 288(%rsp) /* spill counter low; %r8 is reused as scratch and restored before the loop test */
+movq 24(%rsi), %r8
+vpinsrq $1, %r8, %xmm7, %xmm6
+vpinsrq $1, %r10, %xmm7, %xmm7
+vpaddq %xmm6, %xmm5, %xmm13
+movq 40(%rsi), %rcx
+movq 56(%rsi), %rdi
+vpaddq %xmm2, %xmm13, %xmm13
+vmovd %rcx, %xmm5
+vpxor %xmm13, %xmm9, %xmm9
+vpinsrq $1, %rdi, %xmm5, %xmm10
+vpshufb %xmm12, %xmm9, %xmm5
+vpaddq %xmm10, %xmm3, %xmm3
+vpaddq %xmm4, %xmm3, %xmm11
+vpaddq %xmm5, %xmm0, %xmm3
+vpxor %xmm11, %xmm15, %xmm8
+vpshufb %xmm12, %xmm8, %xmm10
+vpaddq %xmm10, %xmm14, %xmm8
+vpxor %xmm3, %xmm2, %xmm14
+vpxor %xmm8, %xmm4, %xmm9
+vpsrlq $63, %xmm14, %xmm4
+vpsrlq $63, %xmm9, %xmm0
+vpaddq %xmm14, %xmm14, %xmm14
+movq 64(%rsi), %r15
+vpor %xmm14, %xmm4, %xmm6
+vpaddq %xmm9, %xmm9, %xmm4
+vmovq 96(%rsi), %xmm9
+vpor %xmm4, %xmm0, %xmm2
+movq 112(%rsi), %r14
+vmovd %r15, %xmm15
+vpinsrq $1, %r14, %xmm9, %xmm0
+vpinsrq $1, %rbx, %xmm9, %xmm9
+vpalignr $8, %xmm6, %xmm2, %xmm4
+vpalignr $8, %xmm2, %xmm6, %xmm2
+vpaddq %xmm0, %xmm11, %xmm11
+movq 80(%rsi), %r11
+vpinsrq $1, %r11, %xmm15, %xmm14
+vpaddq %xmm2, %xmm11, %xmm11
+vpalignr $8, %xmm10, %xmm5, %xmm15
+vpalignr $8, %xmm5, %xmm10, %xmm5
+vpxor %xmm11, %xmm5, %xmm10
+vpaddq %xmm14, %xmm13, %xmm13
+vpaddq %xmm4, %xmm13, %xmm6
+vpshufd $177, %xmm10, %xmm14
+vpxor %xmm6, %xmm15, %xmm13
+vpaddq %xmm14, %xmm3, %xmm0
+vpshufd $177, %xmm13, %xmm13
+vpaddq %xmm13, %xmm8, %xmm15
+vpxor %xmm0, %xmm2, %xmm8
+vpxor %xmm15, %xmm4, %xmm3
+vpshufb %xmm1, %xmm8, %xmm5
+vpshufb %xmm1, %xmm3, %xmm4
+vmovq 72(%rsi), %xmm8
+movq %rax, 296(%rsp) /* spill counter high; %rax is reused as scratch and restored before the loop test */
+movq 88(%rsi), %rax
+vpinsrq $1, %rax, %xmm8, %xmm2
+movq 104(%rsi), %r12
+vpaddq %xmm2, %xmm6, %xmm6
+vpinsrq $1, %r12, %xmm8, %xmm8
+vmovd %r12, %xmm3
+vpaddq %xmm4, %xmm6, %xmm10
+vpxor %xmm10, %xmm13, %xmm13
+movq %rsi, 280(%rsp) /* spill the input pointer; %rsi then receives the last message word */
+movq 120(%rsi), %rsi
+vpinsrq $1, %rsi, %xmm3, %xmm6
+vpshufb %xmm12, %xmm13, %xmm3
+vpaddq %xmm6, %xmm11, %xmm11
+vpaddq %xmm5, %xmm11, %xmm6
+vpxor %xmm6, %xmm14, %xmm14
+vpshufb %xmm12, %xmm14, %xmm2
+vpaddq %xmm3, %xmm15, %xmm14
+vpaddq %xmm2, %xmm0, %xmm0
+vpaddq %xmm8, %xmm6, %xmm6
+vpxor %xmm14, %xmm4, %xmm4
+vpxor %xmm0, %xmm5, %xmm13
+vpsrlq $63, %xmm4, %xmm5
+vpsrlq $63, %xmm13, %xmm15
+vpaddq %xmm4, %xmm4, %xmm4
+vpaddq %xmm13, %xmm13, %xmm13
+vpor %xmm4, %xmm5, %xmm11
+vpor %xmm13, %xmm15, %xmm5
+vpalignr $8, %xmm5, %xmm11, %xmm15
+vmovd %r11, %xmm4
+vpalignr $8, %xmm11, %xmm5, %xmm5
+vmovd %r14, %xmm11
+vpinsrq $1, %r13, %xmm11, %xmm13
+vpinsrq $1, %r15, %xmm4, %xmm11
+vpaddq %xmm5, %xmm6, %xmm6
+vpaddq %xmm13, %xmm10, %xmm10
+vpaddq %xmm15, %xmm10, %xmm10
+vpalignr $8, %xmm3, %xmm2, %xmm13
+vpxor %xmm10, %xmm13, %xmm8
+vmovd %rsi, %xmm13
+vpshufd $177, %xmm8, %xmm8
+vpalignr $8, %xmm2, %xmm3, %xmm3
+vpxor %xmm6, %xmm3, %xmm2
+vpaddq %xmm8, %xmm0, %xmm3
+vpaddq %xmm11, %xmm10, %xmm10
+vpxor %xmm3, %xmm15, %xmm0
+vpshufd $177, %xmm2, %xmm2
+vpshufb %xmm1, %xmm0, %xmm0
+vpaddq %xmm2, %xmm14, %xmm14
+vpxor %xmm14, %xmm5, %xmm5
+vpshufb %xmm1, %xmm5, %xmm15
+vpaddq %xmm0, %xmm10, %xmm5
+vpinsrq $1, %r9, %xmm13, %xmm10
+vpaddq %xmm10, %xmm6, %xmm6
+vpaddq %xmm15, %xmm6, %xmm13
+vpxor %xmm5, %xmm8, %xmm10
+vpxor %xmm13, %xmm2, %xmm8
+vpshufb %xmm12, %xmm10, %xmm4
+vpshufb %xmm12, %xmm8, %xmm6
+vpaddq %xmm4, %xmm3, %xmm8
+vpaddq %xmm6, %xmm14, %xmm2
+vpxor %xmm8, %xmm0, %xmm14
+vpxor %xmm2, %xmm15, %xmm15
+vpaddq %xmm14, %xmm14, %xmm0
+vpsrlq $63, %xmm14, %xmm3
+vpsrlq $63, %xmm15, %xmm14
+vpor %xmm0, %xmm3, %xmm10
+vpaddq %xmm15, %xmm15, %xmm3
+vpor %xmm3, %xmm14, %xmm0
+vpaddq %xmm7, %xmm5, %xmm14
+vpalignr $8, %xmm10, %xmm0, %xmm11
+vmovd %rax, %xmm5
+vpaddq %xmm11, %xmm14, %xmm7
+vpinsrq $1, %rcx, %xmm5, %xmm14
+vpalignr $8, %xmm0, %xmm10, %xmm15
+vpaddq %xmm9, %xmm7, %xmm3
+vmovd %rdi, %xmm9
+vpinsrq $1, %r8, %xmm9, %xmm10
+vpaddq %xmm14, %xmm13, %xmm13
+vpaddq %xmm15, %xmm13, %xmm5
+vpalignr $8, %xmm6, %xmm4, %xmm13
+vpalignr $8, %xmm4, %xmm6, %xmm4
+vpxor %xmm7, %xmm13, %xmm14
+vpxor %xmm5, %xmm4, %xmm6
+vpshufd $177, %xmm14, %xmm13
+vpshufd $177, %xmm6, %xmm14
+vpaddq %xmm13, %xmm2, %xmm6
+vpaddq %xmm14, %xmm8, %xmm4
+vpaddq %xmm10, %xmm5, %xmm5
+vpxor %xmm6, %xmm11, %xmm2
+vpxor %xmm4, %xmm15, %xmm8
+vpshufb %xmm1, %xmm2, %xmm2
+vpshufb %xmm1, %xmm8, %xmm8
+vpaddq %xmm2, %xmm3, %xmm7
+vpaddq %xmm8, %xmm5, %xmm5
+vpxor %xmm7, %xmm13, %xmm13
+vpxor %xmm5, %xmm14, %xmm14
+vpshufb %xmm12, %xmm13, %xmm13
+vpshufb %xmm12, %xmm14, %xmm14
+vpaddq %xmm13, %xmm6, %xmm10
+vpaddq %xmm14, %xmm4, %xmm0
+vpxor %xmm10, %xmm2, %xmm2
+vpxor %xmm0, %xmm8, %xmm8
+vpaddq %xmm2, %xmm2, %xmm6
+vpaddq %xmm8, %xmm8, %xmm15
+vpsrlq $63, %xmm2, %xmm4
+vpsrlq $63, %xmm8, %xmm11
+vpor %xmm6, %xmm4, %xmm3
+vpor %xmm15, %xmm11, %xmm9
+vpxor %xmm0, %xmm7, %xmm0 /* state update starts here: xor two working vectors together ... */
+vpxor 208(%rsp), %xmm0, %xmm7 /* ... then xor in the chaining value saved at 208(%rsp) */
+vpxor %xmm10, %xmm5, %xmm0
+vpalignr $8, %xmm9, %xmm3, %xmm4
+vpalignr $8, %xmm13, %xmm14, %xmm5
+vpalignr $8, %xmm3, %xmm9, %xmm3
+vpxor %xmm5, %xmm4, %xmm6
+vpalignr $8, %xmm14, %xmm13, %xmm8
+vpxor %xmm8, %xmm3, %xmm9
+vmovdqu %xmm7, 208(%rsp) /* new chaining value, bytes 0..15 */
+vpxor 224(%rsp), %xmm0, %xmm2
+vpxor 256(%rsp), %xmm6, %xmm7
+vpxor 240(%rsp), %xmm9, %xmm10
+movq 296(%rsp), %rax /* restore counter high */
+movq 288(%rsp), %r8 /* restore counter low */
+movq 280(%rsp), %rsi /* restore the input pointer */
+vmovdqu %xmm2, 224(%rsp)
+vmovdqu %xmm7, 256(%rsp)
+vmovdqu %xmm10, 240(%rsp)
+cmpq $128, %rdx
+ja blake2b_blocks_avx_L25 /* more than 128 bytes were pending for this pass: process another block */
+blake2b_blocks_avx_L26: /* write the chaining value and counter back to the state, then unwind the frame */
+vmovdqu 240(%rsp), %xmm8
+vmovdqu 256(%rsp), %xmm7
+vmovdqu 224(%rsp), %xmm6
+vmovdqu 208(%rsp), %xmm5
+movq 128(%rsp), %rdi /* state pointer saved during setup; %rdi was used as scratch in the rounds */
+vmovdqu %xmm5, (%rdi)
+vmovdqu %xmm6, 16(%rdi)
+vmovdqu %xmm7, 32(%rdi) /* slot 256(%rsp) holds state bytes 32..47 */
+vmovdqu %xmm8, 48(%rdi) /* slot 240(%rsp) holds state bytes 48..63 */
+movq %r8, 64(%rdi) /* counter low */
+movq %rax, 72(%rdi) /* counter high */
+addq $344, %rsp
+popq %rbx /* pops mirror the pushes in the prologue, in reverse order */
+popq %r15
+popq %r14
+popq %r13
+popq %r12
+movq %rbp, %rsp /* undo the 64-byte stack alignment */
+popq %rbp
+ret
+FN_END blake2b_blocks_avx \ No newline at end of file
diff --git a/src/libcryptobox/blake2/blake2.c b/src/libcryptobox/blake2/blake2.c
index f11eb33ef..9c3ce8c2b 100644
--- a/src/libcryptobox/blake2/blake2.c
+++ b/src/libcryptobox/blake2/blake2.c
@@ -50,11 +50,6 @@ typedef struct blake2b_impl_t {
#define BLAKE2B_IMPL(cpuflags, desc, ext) \
{(cpuflags), desc, blake2b_blocks_##ext}
-#if defined(HAVE_AVX2)
-BLAKE2B_DECLARE(avx2)
-#define BLAKE2B_AVX2 BLAKE2B_IMPL(CPUID_AVX2, "avx2", avx2)
-#endif
-
#if defined(HAVE_AVX)
BLAKE2B_DECLARE(avx)
#define BLAKE2B_AVX BLAKE2B_IMPL(CPUID_AVX, "avx", avx)
@@ -69,21 +64,15 @@ BLAKE2B_DECLARE(x86)
BLAKE2B_DECLARE(ref)
#define BLAKE2B_GENERIC BLAKE2B_IMPL(0, "generic", ref)
-
-
/* list implementations: generic first (blake2b_opt defaults to entry 0), then optimized variants from most optimized to least */
static const blake2b_impl_t blake2b_list[] = {
- /* x86 */
-#if defined(BLAKE2B_AVX2)
- BLAKE2B_AVX2,
-#endif
+ BLAKE2B_GENERIC, /* moved to entry 0, which blake2b_opt below points at by default */
#if defined(BLAKE2B_AVX)
BLAKE2B_AVX,
#endif
#if defined(BLAKE2B_X86)
BLAKE2B_X86,
#endif
- BLAKE2B_GENERIC
};
static const blake2b_impl_t *blake2b_opt = &blake2b_list[0];
diff --git a/src/libcryptobox/blake2/constants.S b/src/libcryptobox/blake2/constants.S
new file mode 100644
index 000000000..5d1a70813
--- /dev/null
+++ b/src/libcryptobox/blake2/constants.S
@@ -0,0 +1,30 @@
+
+.p2align 6 /* 64-byte alignment for the tables below */
+blake2b_constants: /* eight 64-bit words (64 bytes); these match the BLAKE2b initialization vector of RFC 7693 */
+.quad 0x6a09e667f3bcc908 /* word 0 */
+.quad 0xbb67ae8584caa73b /* word 1 */
+.quad 0x3c6ef372fe94f82b /* word 2 */
+.quad 0xa54ff53a5f1d36f1 /* word 3 */
+.quad 0x510e527fade682d1 /* word 4 */
+.quad 0x9b05688c2b3e6c1f /* word 5 */
+.quad 0x1f83d9abfb41bd6b /* word 6 */
+.quad 0x5be0cd19137e2179 /* word 7 */
+
+blake2b_sigma: /* 12 rows of 16 byte offsets (multiples of 8) into a 128-byte message block; starts 64 bytes after blake2b_constants */
+.byte 0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120 /* row 0 */
+.byte 112,80,32,64,72,120,104,48,8,96,0,16,88,56,40,24 /* row 1 */
+.byte 88,64,96,0,40,16,120,104,80,112,24,48,56,8,72,32 /* row 2 */
+.byte 56,72,24,8,104,96,88,112,16,48,40,80,32,0,120,64 /* row 3 */
+.byte 72,0,40,56,16,32,80,120,112,8,88,96,48,64,24,104 /* row 4 */
+.byte 16,96,48,80,0,88,64,24,32,104,56,40,120,112,8,72 /* row 5 */
+.byte 96,40,8,120,112,104,32,80,0,56,48,24,72,16,64,88 /* row 6 */
+.byte 104,88,56,112,96,8,24,72,40,0,120,32,64,48,16,80 /* row 7 */
+.byte 48,120,112,72,88,24,0,64,96,16,104,56,8,32,80,40 /* row 8 */
+.byte 80,16,64,32,56,48,8,40,120,88,72,112,24,96,104,0 /* row 9 */
+.byte 0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120 /* row 10, same as row 0 */
+.byte 112,80,32,64,72,120,104,48,8,96,0,16,88,56,40,24 /* row 11, same as row 1 */
+
+.p2align 4 /* 16-byte alignment for the shuffle masks */
+blake2b_constants_ssse3: /* two 16-byte byte-shuffle masks, loaded by avx.S into %xmm12 and %xmm1 */
+.byte 2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9 /* 64 bit rotate right by 16 */
+.byte 3,4,5,6,7,0,1,2,11,12,13,14,15,8,9,10 /* 64 bit rotate right by 24 */ \ No newline at end of file
diff --git a/src/libcryptobox/blake2/ref.c b/src/libcryptobox/blake2/ref.c
index 15b74351b..ed6f395fc 100644
--- a/src/libcryptobox/blake2/ref.c
+++ b/src/libcryptobox/blake2/ref.c
@@ -77,7 +77,7 @@ U64TO8 (unsigned char *p, blake2b_uint64 v)
p[7] = (v >> 56) & 0xff;
}
-static void
+void
blake2b_blocks_ref (blake2b_state_internal *S,
const unsigned char *in,
size_t bytes,
diff --git a/src/libcryptobox/blake2/x86-32.S b/src/libcryptobox/blake2/x86-32.S
new file mode 100644
index 000000000..12030e57b
--- /dev/null
+++ b/src/libcryptobox/blake2/x86-32.S
@@ -0,0 +1,1080 @@
+#include "../macro.S"
+#include "constants.S"
+
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN blake2b_blocks_x86
+pushl %esi
+pushl %edi
+pushl %ebx
+pushl %ebp
+subl $492, %esp
+movl 512(%esp), %eax
+movl 80(%eax), %ebp
+movl %ebp, %edi
+movl 84(%eax), %ebx
+xorl $-79577749, %edi
+movl %edi, 144(%esp)
+movl %ebx, %edi
+xorl $528734635, %edi
+movl %edi, 148(%esp)
+movl 88(%eax), %edi
+movl 92(%eax), %eax
+xorl $327033209, %edi
+xorl $1541459225, %eax
+movl %edi, 152(%esp)
+LOAD_VAR_PIC blake2b_sigma, %ecx
+lea 192(%ecx), %edi
+movl 516(%esp), %esi
+orl %ebx, %ebp
+movl 520(%esp), %edx
+movl %edi, 360(%esp)
+jne blake2b_blocks_x86_L2
+blake2b_blocks_x86_L32:
+cmpl $128, %edx
+jmp blake2b_blocks_x86_L21
+blake2b_blocks_x86_L2:
+cmpl $128, %edx
+je blake2b_blocks_x86_L21
+blake2b_blocks_x86_L3:
+testb $64, %dl
+lea (%esp), %ebp
+je blake2b_blocks_x86_L5
+blake2b_blocks_x86_L4:
+movl (%esi), %ebx
+movl 4(%esi), %ebp
+movl %ebx, (%esp)
+movl %ebp, 4(%esp)
+movl 8(%esi), %edi
+movl 12(%esi), %ebx
+movl %edi, 8(%esp)
+movl %ebx, 12(%esp)
+movl 16(%esi), %ebp
+movl 20(%esi), %edi
+movl %ebp, 16(%esp)
+movl %edi, 20(%esp)
+movl 24(%esi), %ebx
+movl 28(%esi), %ebp
+movl %ebx, 24(%esp)
+movl %ebp, 28(%esp)
+movl 32(%esi), %edi
+movl 36(%esi), %ebx
+movl %edi, 32(%esp)
+movl %ebx, 36(%esp)
+movl 40(%esi), %ebp
+movl 44(%esi), %edi
+movl %ebp, 40(%esp)
+movl %edi, 44(%esp)
+movl 48(%esi), %ebx
+movl 52(%esi), %ebp
+movl %ebx, 48(%esp)
+movl %ebp, 52(%esp)
+lea 64(%esp), %ebp
+movl 56(%esi), %edi
+movl 60(%esi), %ebx
+addl $64, %esi
+movl %edi, 56(%esp)
+movl %ebx, 60(%esp)
+jmp blake2b_blocks_x86_L6
+blake2b_blocks_x86_L5:
+xorl %ebx, %ebx
+movl %ebx, 64(%esp)
+movl %ebx, 68(%esp)
+movl %ebx, 72(%esp)
+movl %ebx, 76(%esp)
+movl %ebx, 80(%esp)
+movl %ebx, 84(%esp)
+movl %ebx, 88(%esp)
+movl %ebx, 92(%esp)
+movl %ebx, 96(%esp)
+movl %ebx, 100(%esp)
+movl %ebx, 104(%esp)
+movl %ebx, 108(%esp)
+movl %ebx, 112(%esp)
+movl %ebx, 116(%esp)
+movl %ebx, 120(%esp)
+movl %ebx, 124(%esp)
+blake2b_blocks_x86_L6:
+xorl %ebx, %ebx
+testb $32, %dl
+movl %ebx, (%ebp)
+movl %ebx, 4(%ebp)
+movl %ebx, 8(%ebp)
+movl %ebx, 12(%ebp)
+movl %ebx, 16(%ebp)
+movl %ebx, 20(%ebp)
+movl %ebx, 24(%ebp)
+movl %ebx, 28(%ebp)
+movl %ebx, 32(%ebp)
+movl %ebx, 36(%ebp)
+movl %ebx, 40(%ebp)
+movl %ebx, 44(%ebp)
+movl %ebx, 48(%ebp)
+movl %ebx, 52(%ebp)
+movl %ebx, 56(%ebp)
+movl %ebx, 60(%ebp)
+je blake2b_blocks_x86_L8
+blake2b_blocks_x86_L7:
+movl (%esi), %ebx
+movl 4(%esi), %edi
+movl %ebx, (%ebp)
+movl %edi, 4(%ebp)
+movl 8(%esi), %ebx
+movl 12(%esi), %edi
+movl %ebx, 8(%ebp)
+movl %edi, 12(%ebp)
+movl 16(%esi), %ebx
+movl 20(%esi), %edi
+movl %ebx, 16(%ebp)
+movl %edi, 20(%ebp)
+movl 24(%esi), %ebx
+movl 28(%esi), %edi
+addl $32, %esi
+movl %ebx, 24(%ebp)
+movl %edi, 28(%ebp)
+addl $32, %ebp
+blake2b_blocks_x86_L8:
+testb $16, %dl
+je blake2b_blocks_x86_L10
+blake2b_blocks_x86_L9:
+movl (%esi), %ebx
+movl 4(%esi), %edi
+movl %ebx, (%ebp)
+movl %edi, 4(%ebp)
+movl 8(%esi), %ebx
+movl 12(%esi), %edi
+addl $16, %esi
+movl %ebx, 8(%ebp)
+movl %edi, 12(%ebp)
+addl $16, %ebp
+blake2b_blocks_x86_L10:
+testb $8, %dl
+je blake2b_blocks_x86_L12
+blake2b_blocks_x86_L11:
+movl (%esi), %ebx
+movl 4(%esi), %edi
+addl $8, %esi
+movl %ebx, (%ebp)
+movl %edi, 4(%ebp)
+addl $8, %ebp
+blake2b_blocks_x86_L12:
+testb $4, %dl
+je blake2b_blocks_x86_L14
+blake2b_blocks_x86_L13:
+movl (%esi), %ebx
+addl $4, %esi
+movl %ebx, (%ebp)
+addl $4, %ebp
+blake2b_blocks_x86_L14:
+testb $2, %dl
+je blake2b_blocks_x86_L16
+blake2b_blocks_x86_L15:
+movzwl (%esi), %ebx
+addl $2, %esi
+movw %bx, (%ebp)
+addl $2, %ebp
+blake2b_blocks_x86_L16:
+testb $1, %dl
+je blake2b_blocks_x86_L18
+blake2b_blocks_x86_L17:
+movzbl (%esi), %ebx
+movb %bl, (%ebp)
+blake2b_blocks_x86_L18:
+cmpl $128, %edx
+lea (%esp), %esi
+blake2b_blocks_x86_L21:
+movl 512(%esp), %ebp
+lea (%ecx), %ecx
+movl %esi, 236(%esp)
+movl %ecx, 128(%esp)
+movl 68(%ebp), %edi
+movl %edi, 228(%esp)
+movl 60(%ebp), %edi
+movl %edi, 196(%esp)
+movl 72(%ebp), %edi
+movl %edi, 164(%esp)
+movl 76(%ebp), %edi
+movl %edi, 200(%esp)
+movl 24(%ebp), %edi
+movl %edi, 176(%esp)
+movl 28(%ebp), %edi
+movl %edi, 208(%esp)
+movl 16(%ebp), %edi
+movl %edi, 184(%esp)
+movl 20(%ebp), %edi
+movl %edi, 216(%esp)
+movl 48(%ebp), %edi
+movl %edi, 168(%esp)
+movl 52(%ebp), %edi
+movl %edi, 204(%esp)
+movl 8(%ebp), %edi
+movl 64(%ebp), %ebx
+movl %edi, 156(%esp)
+movl 12(%ebp), %edi
+movl %ebx, 192(%esp)
+movl 56(%ebp), %ebx
+movl %edi, 224(%esp)
+movl 40(%ebp), %edi
+movl %ebx, 172(%esp)
+movl %edx, %ebx
+movl %edi, 160(%esp)
+movl 44(%ebp), %edi
+jbe blake2b_blocks_x86_LL3
+movl $128, %ebx
+blake2b_blocks_x86_LL3:
+movl %edi, 212(%esp)
+movl (%ebp), %edi
+movl %edi, 180(%esp)
+movl 4(%ebp), %edi
+movl %edi, 232(%esp)
+movl 32(%ebp), %edi
+movl 36(%ebp), %ebp
+movl %edi, 188(%esp)
+movl %ebp, 220(%esp)
+movl %eax, 132(%esp)
+movl %ebx, 136(%esp)
+movl %edx, 140(%esp)
+movl 512(%esp), %esi
+jmp blake2b_blocks_x86_L22
+blake2b_blocks_x86_L28:
+movl 524(%esp), %eax
+movl 140(%esp), %edx
+addl $-128, %edx
+addl %eax, 236(%esp)
+movl %edx, 140(%esp)
+blake2b_blocks_x86_L22:
+movl 136(%esp), %edx
+xorl %ebx, %ebx
+movl 192(%esp), %eax
+addl %edx, %eax
+movl 228(%esp), %ecx
+adcl $0, %ecx
+movl %eax, 192(%esp)
+movl %eax, 64(%esi)
+subl %edx, %eax
+movl %ecx, 228(%esp)
+movl %ecx, 68(%esi)
+sbbl %ebx, %ecx
+jae blake2b_blocks_x86_L25
+blake2b_blocks_x86_L23:
+movl 164(%esp), %eax
+addl $1, %eax
+movl 200(%esp), %edx
+adcl $0, %edx
+movl %eax, 164(%esp)
+movl %edx, 200(%esp)
+movl %eax, 72(%esi)
+movl %edx, 76(%esi)
+blake2b_blocks_x86_L25:
+movl 152(%esp), %eax
+movl %eax, 312(%esp)
+movl 172(%esp), %ebp
+movl 196(%esp), %ebx
+movl 144(%esp), %eax
+movl 184(%esp), %edi
+movl %ebp, 284(%esp)
+movl %ebx, 288(%esp)
+movl %eax, 296(%esp)
+movl 168(%esp), %ebp
+movl 204(%esp), %ebx
+movl 212(%esp), %eax
+movl %edi, 332(%esp)
+movl %ebp, 276(%esp)
+movl %ebx, 280(%esp)
+movl 148(%esp), %edi
+movl %eax, 272(%esp)
+movl 224(%esp), %ebp
+movl 160(%esp), %ebx
+movl 188(%esp), %eax
+movl 208(%esp), %ecx
+movl %edi, 300(%esp)
+movl %ebp, 248(%esp)
+movl %ebx, 268(%esp)
+movl 180(%esp), %edi
+movl %eax, 260(%esp)
+movl 176(%esp), %edx
+movl 164(%esp), %ebp
+movl 232(%esp), %ebx
+xorl $725511199, %ebp
+movl 128(%esp), %eax
+movl %ebp, 348(%esp)
+movl %ecx, 256(%esp)
+movl 200(%esp), %ebp
+movl 216(%esp), %ecx
+xorl $-1694144372, %ebp
+movl %edi, 240(%esp)
+movl %edx, 316(%esp)
+movl %ebx, 244(%esp)
+movl 220(%esp), %edi
+movl %eax, 292(%esp)
+movl 192(%esp), %ebx
+xorl $-1377402159, %ebx
+movl %ebx, 352(%esp)
+movl %ecx, 252(%esp)
+movl 228(%esp), %ebx
+movl %ebp, 356(%esp)
+xorl $1359893119, %ebx
+movl 132(%esp), %edx
+movl 156(%esp), %ecx
+movl 332(%esp), %ebp
+movl 316(%esp), %esi
+movl %edi, 264(%esp)
+movl $1595750129, 308(%esp)
+movl $-1521486534, 304(%esp)
+movl $-23791573, 324(%esp)
+movl $1013904242, 320(%esp)
+movl $-2067093701, 340(%esp)
+movl $-1150833019, 336(%esp)
+movl $-205731576, 328(%esp)
+movl $1779033703, 344(%esp)
+blake2b_blocks_x86_L26:
+movl %esi, 316(%esp)
+movl %edx, 368(%esp)
+movzbl (%eax), %esi
+movl 236(%esp), %edx
+movl %ecx, 364(%esp)
+movl 240(%esp), %ecx
+addl (%esi,%edx), %ecx
+movl %ebp, 332(%esp)
+movl 244(%esp), %ebp
+adcl 4(%esi,%edx), %ebp
+movl 260(%esp), %edx
+addl %edx, %ecx
+movl 264(%esp), %esi
+adcl %esi, %ebp
+xorl %ebp, %ebx
+movl 352(%esp), %edi
+movl %ecx, 240(%esp)
+xorl %ecx, %edi
+movl 328(%esp), %ecx
+addl %ebx, %ecx
+movl %ebx, 372(%esp)
+movl 344(%esp), %ebx
+adcl %edi, %ebx
+xorl %ecx, %edx
+xorl %ebx, %esi
+movl %edi, 352(%esp)
+movl %edx, %edi
+movl %ecx, 328(%esp)
+movl %esi, %ecx
+shrl $24, %esi
+shll $8, %edx
+orl %edx, %esi
+movl %esi, 264(%esp)
+movzbl 2(%eax), %edx
+movl 236(%esp), %esi
+shll $8, %ecx
+shrl $24, %edi
+orl %edi, %ecx
+movl %ecx, 376(%esp)
+movl 364(%esp), %ecx
+addl (%edx,%esi), %ecx
+movl 248(%esp), %edi
+movl %ebp, 244(%esp)
+movl 268(%esp), %ebp
+adcl 4(%edx,%esi), %edi
+addl %ebp, %ecx
+movl 272(%esp), %edx
+adcl %edx, %edi
+movl %ebx, 344(%esp)
+movl %ecx, 364(%esp)
+movl 348(%esp), %ebx
+xorl %ecx, %ebx
+movl 356(%esp), %ecx
+xorl %edi, %ecx
+movl %edi, 248(%esp)
+movl 340(%esp), %edi
+addl %ecx, %edi
+movl %ecx, 356(%esp)
+movl 336(%esp), %ecx
+adcl %ebx, %ecx
+xorl %edi, %ebp
+xorl %ecx, %edx
+movl %ebx, 348(%esp)
+movl %edx, %ebx
+movl %edi, 340(%esp)
+movl %ebp, %edi
+shrl $24, %edx
+shll $8, %ebp
+orl %ebp, %edx
+movzbl 4(%eax), %ebp
+movl %ecx, 336(%esp)
+shll $8, %ebx
+shrl $24, %edi
+movl 332(%esp), %ecx
+orl %edi, %ebx
+addl (%ebp,%esi), %ecx
+movl 252(%esp), %edi
+adcl 4(%ebp,%esi), %edi
+movl 276(%esp), %ebp
+addl %ebp, %ecx
+movl %edx, 272(%esp)
+movl 280(%esp), %edx
+adcl %edx, %edi
+movl %ebx, 380(%esp)
+movl %ecx, 332(%esp)
+movl 296(%esp), %ebx
+xorl %ecx, %ebx
+movl 300(%esp), %ecx
+xorl %edi, %ecx
+movl %edi, 252(%esp)
+movl 324(%esp), %edi
+addl %ecx, %edi
+movl %ecx, 300(%esp)
+movl 320(%esp), %ecx
+adcl %ebx, %ecx
+xorl %edi, %ebp
+xorl %ecx, %edx
+movl %ebx, 296(%esp)
+movl %edx, %ebx
+movl %edi, 324(%esp)
+movl %ebp, %edi
+shrl $24, %edx
+shll $8, %ebp
+orl %ebp, %edx
+movl %edx, 280(%esp)
+movzbl 6(%eax), %edx
+movl %ecx, 320(%esp)
+shll $8, %ebx
+shrl $24, %edi
+movl 316(%esp), %ecx
+orl %edi, %ebx
+addl (%edx,%esi), %ecx
+movl 256(%esp), %edi
+movl 284(%esp), %ebp
+adcl 4(%edx,%esi), %edi
+addl %ebp, %ecx
+movl 288(%esp), %edx
+adcl %edx, %edi
+movl %ebx, 384(%esp)
+movl %ecx, 316(%esp)
+movl 312(%esp), %ebx
+xorl %ecx, %ebx
+movl 368(%esp), %ecx
+xorl %edi, %ecx
+movl %edi, 256(%esp)
+movl 308(%esp), %edi
+addl %ecx, %edi
+movl %ecx, 368(%esp)
+movl 304(%esp), %ecx
+adcl %ebx, %ecx
+xorl %edi, %ebp
+xorl %ecx, %edx
+movl %ebx, 312(%esp)
+movl %edx, %ebx
+movl %edi, 308(%esp)
+movl %ebp, %edi
+shrl $24, %edx
+shll $8, %ebp
+orl %ebp, %edx
+movzbl 5(%eax), %ebp
+movl %ecx, 304(%esp)
+shll $8, %ebx
+movl (%ebp,%esi), %ecx
+addl 332(%esp), %ecx
+movl 4(%ebp,%esi), %esi
+adcl 252(%esp), %esi
+shrl $24, %edi
+orl %edi, %ebx
+movl %ebx, 388(%esp)
+movl 384(%esp), %ebx
+addl %ebx, %ecx
+movl %edx, 288(%esp)
+movl 280(%esp), %edx
+adcl %edx, %esi
+movl 300(%esp), %ebp
+movl 296(%esp), %edi
+xorl %ecx, %ebp
+xorl %esi, %edi
+movl %ecx, 392(%esp)
+movl %ebp, %ecx
+movl %esi, 396(%esp)
+movl %edi, %esi
+shll $16, %esi
+shrl $16, %ecx
+shrl $16, %edi
+orl %ecx, %esi
+shll $16, %ebp
+orl %ebp, %edi
+movl 324(%esp), %ebp
+addl %esi, %ebp
+movl %esi, 400(%esp)
+movl 320(%esp), %esi
+adcl %edi, %esi
+xorl %ebp, %ebx
+xorl %esi, %edx
+movl %esi, 320(%esp)
+movl %edx, %esi
+movl %edi, 296(%esp)
+movl %ebx, %edi
+shrl $31, %esi
+addl %ebx, %ebx
+shrl $31, %edi
+addl %edx, %edx
+orl %ebx, %esi
+orl %edx, %edi
+movl %esi, 408(%esp)
+movzbl 7(%eax), %edx
+movl 236(%esp), %esi
+movl %edi, 404(%esp)
+movl 288(%esp), %edi
+movl (%edx,%esi), %ebx
+addl 316(%esp), %ebx
+movl 4(%edx,%esi), %ecx
+movl 388(%esp), %edx
+adcl 256(%esp), %ecx
+addl %edx, %ebx
+movl %ebp, 324(%esp)
+adcl %edi, %ecx
+movl 368(%esp), %ebp
+movl 312(%esp), %esi
+xorl %ebx, %ebp
+xorl %ecx, %esi
+movl %ebx, 412(%esp)
+movl %ebp, %ebx
+movl %ecx, 416(%esp)
+movl %esi, %ecx
+shll $16, %ecx
+shrl $16, %ebx
+shrl $16, %esi
+orl %ebx, %ecx
+shll $16, %ebp
+orl %ebp, %esi
+movl 308(%esp), %ebp
+addl %ecx, %ebp
+movl %ecx, 420(%esp)
+movl 304(%esp), %ecx
+adcl %esi, %ecx
+xorl %ebp, %edx
+movl %esi, 312(%esp)
+xorl %ecx, %edi
+movl %edx, %esi
+movl %edi, %ebx
+shrl $31, %esi
+addl %edi, %edi
+orl %edi, %esi
+addl %edx, %edx
+movl %esi, 424(%esp)
+movzbl 3(%eax), %edi
+movl 236(%esp), %esi
+shrl $31, %ebx
+orl %edx, %ebx
+movl (%edi,%esi), %edx
+addl 364(%esp), %edx
+movl %ecx, 304(%esp)
+movl 4(%edi,%esi), %ecx
+movl 380(%esp), %edi
+adcl 248(%esp), %ecx
+addl %edi, %edx
+movl 272(%esp), %esi
+adcl %esi, %ecx
+movl %ebp, 308(%esp)
+movl %ebx, 428(%esp)
+movl 356(%esp), %ebx
+movl 348(%esp), %ebp
+xorl %edx, %ebx
+xorl %ecx, %ebp
+movl %edx, 432(%esp)
+movl %ebp, %edx
+movl %ecx, 436(%esp)
+movl %ebx, %ecx
+shll $16, %edx
+shrl $16, %ecx
+shrl $16, %ebp
+orl %ecx, %edx
+shll $16, %ebx
+orl %ebx, %ebp
+movl 340(%esp), %ebx
+addl %edx, %ebx
+movl %edx, 440(%esp)
+movl 336(%esp), %edx
+adcl %ebp, %edx
+xorl %ebx, %edi
+movl %ebx, 340(%esp)
+xorl %edx, %esi
+movl %edi, %ebx
+movl %esi, %ecx
+shrl $31, %ebx
+addl %esi, %esi
+movl %edx, 336(%esp)
+orl %esi, %ebx
+movzbl 1(%eax), %esi
+addl %edi, %edi
+movl 236(%esp), %edx
+shrl $31, %ecx
+orl %edi, %ecx
+movl (%esi,%edx), %edi
+addl 240(%esp), %edi
+movl %ebp, 348(%esp)
+movl 4(%esi,%edx), %ebp
+movl 376(%esp), %esi
+adcl 244(%esp), %ebp
+addl %esi, %edi
+movl %ecx, 448(%esp)
+movl 264(%esp), %ecx
+adcl %ecx, %ebp
+movl %ebx, 444(%esp)
+movl 372(%esp), %ebx
+movl 352(%esp), %edx
+xorl %edi, %ebx
+xorl %ebp, %edx
+movl %edi, 452(%esp)
+movl %edx, %edi
+movl %ebp, 456(%esp)
+movl %ebx, %ebp
+shll $16, %edi
+shrl $16, %ebp
+shrl $16, %edx
+orl %ebp, %edi
+shll $16, %ebx
+orl %ebx, %edx
+movl 328(%esp), %ebx
+addl %edi, %ebx
+movl %edi, 460(%esp)
+movl 344(%esp), %edi
+adcl %edx, %edi
+xorl %ebx, %esi
+movl %edx, 352(%esp)
+xorl %edi, %ecx
+movl %esi, %edx
+addl %esi, %esi
+movl %ebx, 328(%esp)
+movl %ecx, %ebx
+shrl $31, %edx
+addl %ecx, %ecx
+movl %edi, 344(%esp)
+orl %ecx, %edx
+movzbl 8(%eax), %edi
+movl 236(%esp), %ecx
+shrl $31, %ebx
+orl %esi, %ebx
+movl %ebx, 468(%esp)
+movl 452(%esp), %ebx
+addl (%edi,%ecx), %ebx
+movl 456(%esp), %esi
+movl %edx, 464(%esp)
+movl 448(%esp), %edx
+adcl 4(%edi,%ecx), %esi
+addl %edx, %ebx
+movl 444(%esp), %edi
+adcl %edi, %esi
+movl 420(%esp), %ebp
+movl %ebx, 452(%esp)
+xorl %ebx, %ebp
+movl 312(%esp), %ebx
+xorl %esi, %ebx
+movl %esi, 456(%esp)
+movl 324(%esp), %esi
+addl %ebx, %esi
+movl %ebx, 312(%esp)
+movl 320(%esp), %ebx
+adcl %ebp, %ebx
+xorl %esi, %edx
+xorl %ebx, %edi
+movl %ebp, 420(%esp)
+movzbl 10(%eax), %ebp
+movl %esi, 324(%esp)
+movl %edx, %esi
+movl %ebx, 320(%esp)
+movl %edi, %ebx
+shll $8, %ebx
+shrl $24, %esi
+orl %esi, %ebx
+movl %ebx, 472(%esp)
+movl (%ebp,%ecx), %ebx
+addl 432(%esp), %ebx
+movl 4(%ebp,%ecx), %esi
+adcl 436(%esp), %esi
+shrl $24, %edi
+shll $8, %edx
+orl %edx, %edi
+movl 408(%esp), %edx
+addl %edx, %ebx
+movl %edi, 444(%esp)
+movl 404(%esp), %edi
+adcl %edi, %esi
+movl 460(%esp), %ebp
+movl %ebx, 364(%esp)
+xorl %ebx, %ebp
+movl 352(%esp), %ebx
+xorl %esi, %ebx
+movl %esi, 248(%esp)
+movl 308(%esp), %esi
+addl %ebx, %esi
+movl %ebx, 352(%esp)
+movl 304(%esp), %ebx
+adcl %ebp, %ebx
+xorl %esi, %edx
+xorl %ebx, %edi
+movl %esi, 308(%esp)
+movl %edx, %esi
+movl %ebx, 304(%esp)
+movl %edi, %ebx
+shrl $24, %edi
+shll $8, %edx
+orl %edx, %edi
+movl %edi, 404(%esp)
+movzbl 12(%eax), %edi
+movl %ebp, 460(%esp)
+shll $8, %ebx
+shrl $24, %esi
+movl (%edi,%ecx), %ebp
+orl %esi, %ebx
+addl 392(%esp), %ebp
+movl 4(%edi,%ecx), %esi
+movl 428(%esp), %edx
+adcl 396(%esp), %esi
+addl %edx, %ebp
+movl %ebx, 476(%esp)
+movl 424(%esp), %ebx
+adcl %ebx, %esi
+movl 440(%esp), %edi
+movl %ebp, 332(%esp)
+xorl %ebp, %edi
+movl 348(%esp), %ebp
+xorl %esi, %ebp
+movl %esi, 252(%esp)
+movl 328(%esp), %esi
+addl %ebp, %esi
+movl %ebp, 348(%esp)
+movl 344(%esp), %ebp
+adcl %edi, %ebp
+xorl %esi, %edx
+xorl %ebp, %ebx
+movl %esi, 328(%esp)
+movl %edx, %esi
+movl %ebp, 344(%esp)
+movl %ebx, %ebp
+shrl $24, %ebx
+shll $8, %edx
+orl %edx, %ebx
+movzbl 14(%eax), %edx
+movl %eax, 292(%esp)
+shll $8, %ebp
+shrl $24, %esi
+movl (%edx,%ecx), %eax
+orl %esi, %ebp
+addl 412(%esp), %eax
+movl 4(%edx,%ecx), %esi
+movl 468(%esp), %ecx
+adcl 416(%esp), %esi
+addl %ecx, %eax
+movl 464(%esp), %edx
+adcl %edx, %esi
+movl %edi, 440(%esp)
+movl %eax, 316(%esp)
+movl 400(%esp), %edi
+xorl %eax, %edi
+movl 296(%esp), %eax
+xorl %esi, %eax
+movl %esi, 256(%esp)
+movl 340(%esp), %esi
+addl %eax, %esi
+movl %eax, 296(%esp)
+movl 336(%esp), %eax
+adcl %edi, %eax
+xorl %esi, %ecx
+xorl %eax, %edx
+movl %edi, 400(%esp)
+movl %ecx, %edi
+movl %esi, 340(%esp)
+movl %edx, %esi
+shrl $24, %edx
+shll $8, %ecx
+orl %ecx, %edx
+movl %edx, 464(%esp)
+movl 292(%esp), %edx
+shll $8, %esi
+shrl $24, %edi
+orl %edi, %esi
+movzbl 13(%edx), %edi
+movl 236(%esp), %edx
+movl 332(%esp), %ecx
+addl %ebp, %ecx
+movl %eax, 336(%esp)
+movl 252(%esp), %eax
+adcl %ebx, %eax
+addl (%edi,%edx), %ecx
+movl %ecx, 332(%esp)
+adcl 4(%edi,%edx), %eax
+movl 348(%esp), %edi
+movl 440(%esp), %edx
+xorl %ecx, %edi
+xorl %eax, %edx
+movl %edi, %ecx
+movl %eax, 252(%esp)
+movl %edx, %eax
+shll $16, %eax
+shrl $16, %ecx
+shrl $16, %edx
+orl %ecx, %eax
+shll $16, %edi
+orl %edx, %edi
+movl 328(%esp), %edx
+addl %eax, %edx
+movl %eax, 348(%esp)
+movl 344(%esp), %eax
+adcl %edi, %eax
+xorl %edx, %ebp
+xorl %eax, %ebx
+movl %eax, 344(%esp)
+movl %ebx, %eax
+movl %edi, 356(%esp)
+movl %ebp, %edi
+shrl $31, %eax
+addl %ebp, %ebp
+orl %ebp, %eax
+addl %ebx, %ebx
+movl %eax, 284(%esp)
+movl 292(%esp), %eax
+shrl $31, %edi
+orl %ebx, %edi
+movl %edi, 288(%esp)
+movzbl 15(%eax), %ebx
+movl 236(%esp), %edi
+movl 316(%esp), %ebp
+addl %esi, %ebp
+movl %edx, 328(%esp)
+movl 256(%esp), %edx
+movl 464(%esp), %ecx
+adcl %ecx, %edx
+addl (%ebx,%edi), %ebp
+movl %ebp, 316(%esp)
+adcl 4(%ebx,%edi), %edx
+movl 296(%esp), %edi
+movl 400(%esp), %ebx
+xorl %ebp, %edi
+xorl %edx, %ebx
+movl %edi, %ebp
+movl %edx, 256(%esp)
+movl %ebx, %edx
+shll $16, %edx
+shrl $16, %ebp
+shrl $16, %ebx
+orl %ebp, %edx
+shll $16, %edi
+orl %ebx, %edi
+movl 340(%esp), %ebx
+addl %edx, %ebx
+movl %edx, 296(%esp)
+movl 336(%esp), %edx
+adcl %edi, %edx
+xorl %ebx, %esi
+xorl %edx, %ecx
+movl %edx, 336(%esp)
+movl %ecx, %edx
+movl %edi, 300(%esp)
+movl %esi, %edi
+shrl $31, %edx
+addl %esi, %esi
+shrl $31, %edi
+addl %ecx, %ecx
+movl %ebx, 340(%esp)
+orl %esi, %edx
+movzbl 11(%eax), %ebp
+orl %ecx, %edi
+movl 236(%esp), %ebx
+movl %edx, 260(%esp)
+movl 364(%esp), %ecx
+movl 476(%esp), %edx
+addl %edx, %ecx
+movl %edi, 264(%esp)
+movl 248(%esp), %edi
+movl 404(%esp), %esi
+adcl %esi, %edi
+addl (%ebp,%ebx), %ecx
+movl %ecx, 364(%esp)
+adcl 4(%ebp,%ebx), %edi
+movl 352(%esp), %ebp
+movl 460(%esp), %ebx
+xorl %ecx, %ebp
+xorl %edi, %ebx
+movl %ebp, %ecx
+movl %edi, 248(%esp)
+movl %ebx, %edi
+shll $16, %edi
+shrl $16, %ecx
+shrl $16, %ebx
+orl %ecx, %edi
+shll $16, %ebp
+orl %ebx, %ebp
+movl 308(%esp), %ebx
+addl %edi, %ebx
+movl %edi, 352(%esp)
+movl 304(%esp), %edi
+adcl %ebp, %edi
+xorl %ebx, %edx
+xorl %edi, %esi
+movl %edi, 304(%esp)
+movl %esi, %edi
+movl %ebp, 372(%esp)
+movl %edx, %ebp
+shrl $31, %edi
+addl %edx, %edx
+shrl $31, %ebp
+addl %esi, %esi
+movzbl 9(%eax), %ecx
+orl %edx, %edi
+movl 236(%esp), %edx
+orl %esi, %ebp
+movl %ebx, 308(%esp)
+addl $16, %eax
+movl %edi, 276(%esp)
+movl 452(%esp), %ebx
+movl 472(%esp), %edi
+addl %edi, %ebx
+movl %ebp, 280(%esp)
+movl 456(%esp), %ebp
+movl 444(%esp), %esi
+adcl %esi, %ebp
+addl (%ecx,%edx), %ebx
+movl %ebx, 240(%esp)
+adcl 4(%ecx,%edx), %ebp
+movl 312(%esp), %edx
+movl 420(%esp), %ecx
+xorl %ebx, %edx
+xorl %ebp, %ecx
+movl %ebp, 244(%esp)
+movl %ecx, %ebx
+movl %edx, %ebp
+shll $16, %ebx
+shrl $16, %ebp
+shrl $16, %ecx
+orl %ebp, %ebx
+shll $16, %edx
+orl %ecx, %edx
+movl 324(%esp), %ecx
+addl %ebx, %ecx
+movl %ebx, 312(%esp)
+movl 320(%esp), %ebx
+adcl %edx, %ebx
+xorl %ecx, %edi
+xorl %ebx, %esi
+movl %edi, %ebp
+movl %ecx, 324(%esp)
+movl %esi, %ecx
+shrl $31, %ecx
+addl %edi, %edi
+shrl $31, %ebp
+addl %esi, %esi
+orl %esi, %ebp
+orl %edi, %ecx
+movl %ebx, 320(%esp)
+movl %ebp, 272(%esp)
+movl %ecx, 268(%esp)
+movl 332(%esp), %ebp
+movl 316(%esp), %esi
+movl 364(%esp), %ecx
+movl 372(%esp), %ebx
+cmpl 360(%esp), %eax
+jb blake2b_blocks_x86_L26
+blake2b_blocks_x86_L27: /* Output fold: each of the 16 saved dwords (frame slots 156..232) is XORed with two working values, then written back to its slot and to the buffer at 512(%esp); presumably BLAKE2b h[i] ^= v[i] ^ v[i+8]. */
+movl 328(%esp), %edi
+xorl 240(%esp), %edi
+movl %esi, 316(%esp) /* spill esi before it is reused as the output pointer */
+movl 512(%esp), %esi /* first stack argument: 492-byte frame + 4 saved registers + return address */
+movl 180(%esp), %eax
+movl %edx, 368(%esp) /* spill edx; reloaded from 368(%esp) for the last output dword */
+xorl %edi, %eax
+movl 344(%esp), %edx
+movl %eax, 180(%esp) /* every result is stored twice: back to its frame slot ... */
+movl %eax, (%esi) /* ... and to the output buffer (dword 0) */
+movl 340(%esp), %eax
+xorl %ecx, %eax /* ecx still holds a live working value on entry to this label */
+movl 336(%esp), %ecx
+xorl 244(%esp), %edx
+xorl 248(%esp), %ecx
+movl 232(%esp), %edi
+xorl %edx, %edi
+movl 156(%esp), %edx
+xorl %eax, %edx
+movl 224(%esp), %eax
+movl %edi, 232(%esp)
+xorl %ecx, %eax
+movl %edi, 4(%esi) /* output dword 1 */
+movl %ebp, 332(%esp) /* spill ebp; XORed in from 332(%esp) two loads below */
+movl %eax, 224(%esp)
+movl %eax, 12(%esi) /* output dword 3 */
+movl 324(%esp), %edi
+movl 320(%esp), %eax
+xorl 332(%esp), %edi
+xorl 252(%esp), %eax
+movl %edx, 156(%esp)
+movl %edx, 8(%esi) /* output dword 2 */
+movl 184(%esp), %edx
+movl 216(%esp), %ecx
+xorl %edi, %edx
+movl %edx, 184(%esp)
+xorl %eax, %ecx
+movl %edx, 16(%esi) /* output dword 4 */
+movl 308(%esp), %eax
+movl 304(%esp), %edx
+xorl 316(%esp), %eax
+xorl 256(%esp), %edx
+movl 176(%esp), %edi
+xorl 264(%esp), %ebx /* ebx still holds a live working value on entry to this label */
+xorl %eax, %edi
+movl 208(%esp), %eax
+xorl %edx, %eax
+movl %eax, 208(%esp)
+movl %eax, 28(%esi) /* output dword 7 */
+movl 352(%esp), %edx
+movl 220(%esp), %eax
+movl 356(%esp), %ebp
+xorl %ebx, %eax
+movl 348(%esp), %ebx
+xorl 260(%esp), %edx
+xorl 268(%esp), %ebx
+xorl 272(%esp), %ebp
+movl %ecx, 216(%esp)
+movl %ecx, 20(%esi) /* output dword 5 */
+movl 188(%esp), %ecx
+movl %eax, 220(%esp)
+xorl %edx, %ecx
+movl %eax, 36(%esi) /* output dword 9 */
+movl 160(%esp), %eax
+movl 212(%esp), %edx
+xorl %ebx, %eax
+xorl %ebp, %edx
+movl 296(%esp), %ebp
+movl %eax, 160(%esp)
+movl %eax, 40(%esi) /* output dword 10 */
+movl %edi, 176(%esp)
+movl %edi, 24(%esi) /* output dword 6 */
+movl 300(%esp), %eax
+movl 312(%esp), %ebx
+movl 368(%esp), %edi /* the edx value spilled near the top of this label */
+xorl 276(%esp), %ebp
+xorl 280(%esp), %eax
+xorl 284(%esp), %ebx
+xorl 288(%esp), %edi
+movl %edx, 212(%esp)
+movl %edx, 44(%esi) /* output dword 11 */
+movl 168(%esp), %edx
+movl %ecx, 188(%esp)
+xorl %ebp, %edx
+movl %ecx, 32(%esi) /* output dword 8 */
+movl %edx, 168(%esp)
+movl 204(%esp), %ecx
+movl %edx, 48(%esi) /* output dword 12 */
+xorl %eax, %ecx
+movl 172(%esp), %eax
+movl 196(%esp), %edx
+xorl %ebx, %eax
+xorl %edi, %edx
+movl %ecx, 204(%esp)
+movl %ecx, 52(%esi) /* output dword 13 */
+movl %eax, 172(%esp)
+movl %edx, 196(%esp)
+movl %eax, 56(%esi) /* output dword 14 */
+movl %edx, 60(%esi) /* output dword 15 */
+cmpl $128, 140(%esp) /* NOTE(review): 140(%esp) looks like the remaining byte count - confirm */
+ja blake2b_blocks_x86_L28 /* taken when 140(%esp) > 128 (unsigned); otherwise fall through to the return path */
+blake2b_blocks_x86_L29:
+addl $492, %esp /* release the 492-byte frame */
+popl %ebp /* restore the four saved registers */
+popl %ebx
+popl %edi
+popl %esi
+ret
+FN_END blake2b_blocks_x86
diff --git a/src/libcryptobox/blake2/x86-64.S b/src/libcryptobox/blake2/x86-64.S
new file mode 100644
index 000000000..f0de795fb
--- /dev/null
+++ b/src/libcryptobox/blake2/x86-64.S
@@ -0,0 +1,1754 @@
+#include "../macro.S"
+#include "constants.S"
+
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN_EXT blake2b_blocks_x86, 4, 8 /* Entry. As used below: rdi = state block, rsi = input bytes, rdx = byte count; incoming rcx is only spilled here (its role is not visible in this part). */
+pushq %rbx /* save the registers this routine clobbers */
+pushq %rbp
+pushq %r12
+pushq %r13
+pushq %r14
+pushq %r15
+movq %rsp, %r9 /* keep the pre-alignment stack pointer; spilled to 168(%rsp) below */
+subq $320, %rsp
+andq $~63, %rsp /* round the frame down to a 64-byte boundary */
+cmpq $128, %rdx
+movq %rdx, %rax /* mov leaves the flags of the cmp untouched for the jb */
+jb blake2b_blocks_x86_usebytesinc
+movq $128, %rax /* rax = min(rdx, 128) */
+blake2b_blocks_x86_usebytesinc:
+movq %rdx, 136(%rsp) /* spill: byte count */
+movq %rcx, 144(%rsp) /* spill: incoming rcx */
+movq %rax, 152(%rsp) /* spill: min(byte count, 128) */
+movq %rdi, 160(%rsp) /* spill: state pointer */
+movq %r9, 168(%rsp) /* spill: pre-alignment rsp */
+movq 80(%rdi), %rcx
+andq %rcx, %rcx /* ZF set when the qword at state+80 is zero (a "not final" call, per the label name) */
+jz blake2b_blocks_x86_not_final_call
+cmpq $128, %rdx
+je blake2b_blocks_x86_not_final_call /* an exactly-128-byte input is consumed in place, no staging */
+leaq 0(%rsp), %rcx /* rcx = write cursor into the 128-byte buffer at the bottom of the frame */
+pxor %xmm0, %xmm0
+movdqa %xmm0, 0(%rcx) /* zero all 128 bytes so whatever follows the copied input is zero padding */
+movdqa %xmm0, 16(%rcx)
+movdqa %xmm0, 32(%rcx)
+movdqa %xmm0, 48(%rcx)
+movdqa %xmm0, 64(%rcx)
+movdqa %xmm0, 80(%rcx)
+movdqa %xmm0, 96(%rcx)
+movdqa %xmm0, 112(%rcx)
+testq $0x40, %rdx /* copy the input piecewise: one chunk per set bit (64,32,16,8,4,2,1) of the count; only the low seven bits are examined */
+jz blake2b_blocks_x86_skip64
+movdqu 0(%rsi), %xmm0 /* unaligned loads from the caller's buffer, aligned stores to the frame */
+movdqu 16(%rsi), %xmm1
+movdqu 32(%rsi), %xmm2
+movdqu 48(%rsi), %xmm3
+movdqa %xmm0, 0(%rcx)
+movdqa %xmm1, 16(%rcx)
+movdqa %xmm2, 32(%rcx)
+movdqa %xmm3, 48(%rcx)
+addq $64, %rsi /* advance both cursors past the chunk just copied */
+addq $64, %rcx
+blake2b_blocks_x86_skip64:
+testq $0x20, %rdx
+jz blake2b_blocks_x86_skip32
+movdqu 0(%rsi), %xmm0
+movdqu 16(%rsi), %xmm1
+movdqa %xmm0, 0(%rcx)
+movdqa %xmm1, 16(%rcx)
+addq $32, %rsi
+addq $32, %rcx
+blake2b_blocks_x86_skip32:
+testq $0x10, %rdx
+jz blake2b_blocks_x86_skip16
+movdqu 0(%rsi), %xmm0
+movdqa %xmm0, 0(%rcx)
+addq $16, %rsi
+addq $16, %rcx
+blake2b_blocks_x86_skip16:
+testq $0x8, %rdx
+jz blake2b_blocks_x86_skip8
+movq 0(%rsi), %rax
+movq %rax, 0(%rcx)
+addq $8, %rsi
+addq $8, %rcx
+blake2b_blocks_x86_skip8:
+testq $0x4, %rdx
+jz blake2b_blocks_x86_skip4
+movl 0(%rsi), %eax
+movl %eax, 0(%rcx)
+addq $4, %rsi
+addq $4, %rcx
+blake2b_blocks_x86_skip4:
+testq $0x2, %rdx
+jz blake2b_blocks_x86_skip2
+movw 0(%rsi), %ax
+movw %ax, 0(%rcx)
+addq $2, %rsi
+addq $2, %rcx
+blake2b_blocks_x86_skip2:
+testq $0x1, %rdx
+jz blake2b_blocks_x86_skip1
+movb 0(%rsi), %al
+movb %al, 0(%rcx) /* last chunk: the cursors are not advanced because nothing follows */
+blake2b_blocks_x86_skip1:
+leaq 0(%rsp), %rsi /* from here on the input pointer is the zero-padded stack copy */
+blake2b_blocks_x86_not_final_call: /* Load the state block at rdi into registers and frame slots ahead of the main loop. */
+movq %rsi, 128(%rsp) /* 128(%rsp) = current input pointer */
+movq 64(%rdi), %r12 /* state+64 and state+72 are copied unchanged to 256/264(%rsp) */
+movq 72(%rdi), %r13
+movq 80(%rdi), %r14
+movq 88(%rdi), %r15
+movabsq $0x1f83d9abfb41bd6b, %rax /* BLAKE2b IV[6] */
+movabsq $0x5be0cd19137e2179, %rbx /* BLAKE2b IV[7] */
+xorq %rax, %r14 /* (state+80) ^ IV[6], stored to 272(%rsp) */
+xorq %rbx, %r15 /* (state+88) ^ IV[7], stored to 280(%rsp) */
+movq %r12, 256(%rsp)
+movq %r13, 264(%rsp)
+movq %r14, 272(%rsp)
+movq %r15, 280(%rsp)
+movq 0(%rdi), %rax /* the eight qwords at state+0..56 now live in rax, rdx, r8, r12, rbx, rsi, r9, r13 */
+movq 8(%rdi), %rdx /* rdx is reused: the byte count it held is no longer in a register */
+movq 16(%rdi), %r8
+movq 24(%rdi), %r12
+movq 32(%rdi), %rbx
+movq 40(%rdi), %rsi /* rsi is reused: the input pointer was saved to 128(%rsp) above */
+movq 48(%rdi), %r9
+movq 56(%rdi), %r13
+.p2align 6,,63
+blake2b_blocks_x86_mainloop: /* Per-block setup: stage the 128-byte block at 0(%rsp), advance the 128-bit counter, and build the working values that are not already in registers. */
+movq 128(%rsp), %r10 /* r10 = current input pointer */
+cmpq %r10, %rsp
+je blake2b_blocks_x86_nocopy /* input pointer equals rsp: the block is already in the frame buffer */
+movdqu 0(%r10), %xmm0 /* unaligned loads of 128 input bytes ... */
+movdqu 16(%r10), %xmm1
+movdqu 32(%r10), %xmm2
+movdqu 48(%r10), %xmm3
+movdqu 64(%r10), %xmm4
+movdqu 80(%r10), %xmm5
+movdqu 96(%r10), %xmm6
+movdqu 112(%r10), %xmm7
+movdqa %xmm0, 0(%rsp) /* ... then aligned stores to 0..127(%rsp), where the rounds read the message qwords */
+movdqa %xmm1, 16(%rsp)
+movdqa %xmm2, 32(%rsp)
+movdqa %xmm3, 48(%rsp)
+movdqa %xmm4, 64(%rsp)
+movdqa %xmm5, 80(%rsp)
+movdqa %xmm6, 96(%rsp)
+movdqa %xmm7, 112(%rsp)
+blake2b_blocks_x86_nocopy:
+movq 152(%rsp), %r10 /* counter increment for this block */
+movq 256(%rsp), %rcx /* counter, low qword */
+movq 264(%rsp), %rbp /* counter, high qword */
+movabsq $0x510e527fade682d1, %r11 /* BLAKE2b IV[4] */
+movabsq $0x9b05688c2b3e6c1f, %r15 /* BLAKE2b IV[5] */
+addq %r10, %rcx
+cmpq %r10, %rcx /* sum below the addend (unsigned) means the add wrapped */
+jae blake2b_blocks_x86_nocountercarry
+addq $1, %rbp /* propagate the carry into the high qword */
+blake2b_blocks_x86_nocountercarry:
+movq %rcx, 256(%rsp) /* write the advanced counter back before it is mixed */
+movq %rbp, 264(%rsp)
+xorq %r11, %rcx /* rcx = counter low ^ IV[4] */
+xorq %r15, %rbp /* rbp = counter high ^ IV[5] */
+movabsq $0x6a09e667f3bcc908, %r11 /* BLAKE2b IV[0] */
+movabsq $0xbb67ae8584caa73b, %rdi /* BLAKE2b IV[1] */
+movabsq $0x3c6ef372fe94f82b, %r10 /* BLAKE2b IV[2] */
+movabsq $0xa54ff53a5f1d36f1, %r14 /* BLAKE2b IV[3] */
+movq %r11, 296(%rsp) /* IV[0] is kept in a frame slot, freeing r11 for the next load */
+movq 272(%rsp), %r11 /* the two qwords stored at 272/280(%rsp) before the loop */
+movq 280(%rsp), %r15
+movq %rax, 192(%rsp) /* snapshot the eight incoming state qwords at 192..248(%rsp) */
+movq %rdx, 200(%rsp)
+movq %r8, 208(%rsp)
+movq %r12, 216(%rsp)
+movq %rbx, 224(%rsp)
+movq %rsi, 232(%rsp)
+movq %r9, 240(%rsp)
+movq %r13, 248(%rsp)
+addq 0(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 16(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 32(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 48(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 8(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 24(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 40(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 56(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 64(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 80(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 96(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 112(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 72(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 88(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 104(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 120(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+addq 112(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 32(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 72(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 104(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 80(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 64(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 120(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 48(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 8(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 0(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 88(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 40(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 96(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 16(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 56(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 24(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+addq 88(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 96(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 40(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 120(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 64(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 0(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 16(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 104(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 80(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 24(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 56(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 72(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 112(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 48(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 8(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 32(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+addq 56(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 24(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 104(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 88(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 72(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 8(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 96(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 112(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 16(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 40(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 32(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 120(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 48(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 80(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 0(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 64(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+addq 72(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 40(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 16(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 80(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 0(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 56(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 32(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 120(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 112(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 88(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 48(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 24(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 8(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 96(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 64(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 104(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+addq 16(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 48(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 0(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 64(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 96(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 80(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 88(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 24(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 32(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 56(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 120(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 8(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 104(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 40(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 112(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 72(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+addq 96(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 8(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 112(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 32(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 40(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 120(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 104(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 80(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 0(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 48(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 72(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 64(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 56(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 24(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 16(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 88(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+addq 104(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 56(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 96(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 24(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 88(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 112(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 8(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 72(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 40(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 120(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 64(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 16(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 0(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 32(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 48(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 80(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+addq 48(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 112(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 88(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 0(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 120(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 72(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 24(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 64(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 96(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 104(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 8(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 80(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 16(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 56(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 32(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 40(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+addq 80(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 64(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 56(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 8(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 16(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 32(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 48(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 40(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 120(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 72(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 24(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 104(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 88(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 112(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 96(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 0(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+addq 0(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 16(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 32(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 48(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 8(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 24(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 40(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 56(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 64(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 80(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 96(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 112(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 72(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 88(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 104(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 120(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+addq 112(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $32, %rcx
+addq 32(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $32, %rbp
+addq 72(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $32, %r11
+addq 104(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $32, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $40, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $40, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $40, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $40, %r13
+addq 80(%rsp), %rax
+addq %rbx, %rax
+xorq %rax, %rcx
+rolq $48, %rcx
+addq 64(%rsp), %rdx
+addq %rsi, %rdx
+xorq %rdx, %rbp
+rolq $48, %rbp
+addq 120(%rsp), %r8
+addq %r9, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %r11
+rolq $48, %r11
+addq 48(%rsp), %r12
+addq %r13, %r12
+xorq %r12, %r15
+rolq $48, %r15
+movq 296(%rsp), %rax
+addq %rcx, %rax
+xorq %rax, %rbx
+movq %rax, 296(%rsp)
+rolq $1, %rbx
+addq %rbp, %rdi
+xorq %rdi, %rsi
+rolq $1, %rsi
+addq %r11, %r10
+xorq %r10, %r9
+rolq $1, %r9
+movq 288(%rsp), %rax
+addq %r15, %r14
+xorq %r14, %r13
+rolq $1, %r13
+addq 8(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $32, %r15
+addq 0(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $32, %rcx
+addq 88(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $32, %rbp
+addq 40(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $32, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $40, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $40, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $40, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $40, %rbx
+addq 96(%rsp), %rax
+addq %rsi, %rax
+xorq %rax, %r15
+rolq $48, %r15
+addq 16(%rsp), %rdx
+addq %r9, %rdx
+xorq %rdx, %rcx
+rolq $48, %rcx
+addq 56(%rsp), %r8
+addq %r13, %r8
+movq %rax, 288(%rsp)
+xorq %r8, %rbp
+rolq $48, %rbp
+addq 24(%rsp), %r12
+addq %rbx, %r12
+xorq %r12, %r11
+rolq $48, %r11
+addq %r15, %r10
+xorq %r10, %rsi
+rolq $1, %rsi
+addq %rcx, %r14
+xorq %r14, %r9
+rolq $1, %r9
+movq 296(%rsp), %rax
+addq %rbp, %rax
+xorq %rax, %r13
+movq %rax, 296(%rsp)
+rolq $1, %r13
+movq 288(%rsp), %rax
+addq %r11, %rdi
+xorq %rdi, %rbx
+rolq $1, %rbx
+xorq 296(%rsp), %rax
+xorq %rdi, %rdx
+xorq %r10, %r8
+xorq %r14, %r12
+xorq %rcx, %rbx
+xorq %rbp, %rsi
+xorq %r11, %r9
+xorq %r15, %r13
+xorq 192(%rsp), %rax
+xorq 200(%rsp), %rdx
+xorq 208(%rsp), %r8
+xorq 216(%rsp), %r12
+xorq 224(%rsp), %rbx
+xorq 232(%rsp), %rsi
+xorq 240(%rsp), %r9
+xorq 248(%rsp), %r13
+movq 128(%rsp), %rcx
+movq 136(%rsp), %rbp
+movq 144(%rsp), %r11
+cmpq $128, %rbp
+jbe blake2b_blocks_x86_done
+addq %r11, %rcx
+subq $128, %rbp
+movq %rcx, 128(%rsp)
+movq %rbp, 136(%rsp)
+jmp blake2b_blocks_x86_mainloop
+blake2b_blocks_x86_done:
+movq 160(%rsp), %rcx
+movq 256(%rsp), %rbp
+movq 264(%rsp), %r11
+movq %rax, 0(%rcx)
+movq %rdx, 8(%rcx)
+movq %r8, 16(%rcx)
+movq %r12, 24(%rcx)
+movq %rbx, 32(%rcx)
+movq %rsi, 40(%rcx)
+movq %r9, 48(%rcx)
+movq %r13, 56(%rcx)
+movq %rbp, 64(%rcx)
+movq %r11, 72(%rcx)
+movq 168(%rsp), %rsp
+popq %r15
+popq %r14
+popq %r13
+popq %r12
+popq %rbp
+popq %rbx
+ret
+FN_END blake2b_blocks_x86