From d83b9e01d10303a34ddc45a2751e028ec4b8a0f4 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Thu, 3 Mar 2016 10:37:47 +0000
Subject: [PATCH] [Feature] Add AVX2 version of siphash

This version has been taken from Google's HighwayHash code at
https://github.com/google/highwayhash/
The final assembly was obtained by compiling the C++ implementation with
the clang-3.8 compiler.
---
 src/libcryptobox/CMakeLists.txt      |   1 +
 src/libcryptobox/siphash/avx2.S      | 201 +++++++++++++++++++++++++++
 src/libcryptobox/siphash/constants.S |  26 ++++
 src/libcryptobox/siphash/siphash.c   |  10 +-
 4 files changed, 237 insertions(+), 1 deletion(-)
 create mode 100644 src/libcryptobox/siphash/avx2.S

diff --git a/src/libcryptobox/CMakeLists.txt b/src/libcryptobox/CMakeLists.txt
index b3defe63f..8318f3ed9 100644
--- a/src/libcryptobox/CMakeLists.txt
+++ b/src/libcryptobox/CMakeLists.txt
@@ -67,6 +67,7 @@ ENDIF()
 IF(HAVE_AVX2)
 	SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/avx2.S)
 	SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/avx2.S)
+	SET(SIPHASHSRC ${SIPHASHSRC} ${CMAKE_CURRENT_SOURCE_DIR}/siphash/avx2.S)
 ENDIF(HAVE_AVX2)
 IF(HAVE_AVX)
 	SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/avx.S)
diff --git a/src/libcryptobox/siphash/avx2.S b/src/libcryptobox/siphash/avx2.S
new file mode 100644
index 000000000..995fc7636
--- /dev/null
+++ b/src/libcryptobox/siphash/avx2.S
@@ -0,0 +1,201 @@
+/*-
+ * Copyright 2015 Google Inc. All Rights Reserved.
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include "../macro.S" +#include "constants.S" + +/* + * Generated by clang-3.8 from siphash avx2 implementation written by + * Jan Wassenberg and Jyrki Alakuijala + */ + +SECTION_TEXT + +GLOBAL_HIDDEN_FN siphash_avx2 +siphash_avx2_local: + .cfi_startproc +## BB#0: ## %entry + pushq %rbp +Ltmp0: + .cfi_def_cfa_offset 16 +Ltmp1: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +Ltmp2: + .cfi_def_cfa_register %rbp + pushq %rbx + subq $40, %rsp +Ltmp3: + .cfi_offset %rbx, -24 + movq %rdx, %rbx + vmovdqu (%rdi), %xmm0 + vpxor LCPI0_0(%rip), %xmm0, %xmm1 + vpxor LCPI0_1(%rip), %xmm0, %xmm0 + vpunpcklqdq %xmm0, %xmm1, %xmm6 ## xmm6 = xmm1[0],xmm0[0] + vpunpckhqdq %xmm0, %xmm1, %xmm7 ## xmm7 = xmm1[1],xmm0[1] + movq %rbx, %rax + andq $-8, %rax + je LBB0_1 +## BB#2: ## %for.body.preheader + xorl %ecx, %ecx + vmovdqa LCPI0_2(%rip), %xmm0 ## xmm0 = [13,16] + vmovdqa LCPI0_3(%rip), %xmm1 ## xmm1 = [51,48] + vmovdqa LCPI0_4(%rip), %xmm2 ## xmm2 = [17,21] + vmovdqa LCPI0_5(%rip), %xmm3 ## xmm3 = [47,43] + .align 4, 0x90 +LBB0_3: ## %for.body + ## =>This Inner Loop Header: Depth=1 + vmovq (%rsi,%rcx), %xmm4 ## xmm4 = mem[0],zero + vpslldq $8, %xmm4, %xmm5 ## xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] + vpxor %xmm5, %xmm7, %xmm5 + vpaddq %xmm6, %xmm5, %xmm6 + vpsllvq %xmm0, %xmm5, %xmm7 + vpsrlvq %xmm1, %xmm5, %xmm5 + vpor %xmm7, %xmm5, %xmm5 + vpxor %xmm6, %xmm5, %xmm5 + vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0] + vpaddq %xmm5, %xmm6, %xmm6 + vpsllvq %xmm2, %xmm5, %xmm7 + vpsrlvq %xmm3, %xmm5, %xmm5 + vpor %xmm7, %xmm5, %xmm5 + vpxor %xmm6, %xmm5, %xmm5 + vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0] + vpaddq %xmm5, %xmm6, %xmm6 + vpsllvq %xmm0, %xmm5, %xmm7 + vpsrlvq %xmm1, %xmm5, %xmm5 + vpor %xmm7, %xmm5, %xmm5 + vpxor %xmm6, %xmm5, %xmm5 + vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0] + vpaddq %xmm5, %xmm6, %xmm6 + vpsllvq %xmm2, %xmm5, %xmm7 + vpsrlvq %xmm3, %xmm5, %xmm5 + vpor %xmm7, %xmm5, %xmm5 + vpxor %xmm6, %xmm5, %xmm7 + vpshufd $30, %xmm6, %xmm5 ## xmm5 = xmm6[2,3,1,0] + vpxor %xmm5, %xmm4, %xmm6 + addq $8, %rcx + cmpq %rax, %rcx + jb LBB0_3 +## BB#4: ## %for.end.loopexit + vmovdqa %xmm7, -48(%rbp) ## 16-byte Spill + vmovdqa %xmm6, -32(%rbp) ## 16-byte Spill + addq %rax, %rsi + jmp LBB0_5 +LBB0_1: + vmovdqa %xmm7, -48(%rbp) ## 16-byte Spill + vmovdqa %xmm6, -32(%rbp) ## 16-byte Spill + xorl %eax, %eax +LBB0_5: ## %for.end + movq $0, -16(%rbp) + movq %rbx, %rdx + subq %rax, %rdx + leaq -16(%rbp), %rdi + callq _memcpy + movb %bl, -9(%rbp) + vmovq -16(%rbp), %xmm4 ## xmm4 = mem[0],zero + vpslldq $8, %xmm4, %xmm0 ## xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] + vpxor -48(%rbp), %xmm0, %xmm2 ## 16-byte Folded Reload + vpaddq -32(%rbp), %xmm2, %xmm3 ## 16-byte Folded Reload + vmovdqa LCPI0_2(%rip), %xmm0 ## xmm0 = [13,16] + vpsllvq %xmm0, %xmm2, %xmm5 + vmovdqa LCPI0_3(%rip), %xmm1 ## xmm1 = [51,48] + vpsrlvq %xmm1, %xmm2, %xmm2 + vpor %xmm5, %xmm2, %xmm2 + vpxor %xmm3, %xmm2, %xmm5 + vpshufd $30, %xmm3, %xmm2 ## xmm2 = xmm3[2,3,1,0] + vpaddq %xmm5, %xmm2, %xmm6 + vmovdqa LCPI0_4(%rip), %xmm2 ## xmm2 = [17,21] + vpsllvq %xmm2, %xmm5, %xmm7 + vmovdqa LCPI0_5(%rip), %xmm3 ## xmm3 = [47,43] + vpsrlvq %xmm3, %xmm5, %xmm5 + vpor %xmm7, %xmm5, %xmm5 + vpxor %xmm6, %xmm5, %xmm5 + vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0] + vpaddq %xmm5, %xmm6, %xmm6 + vpsllvq %xmm0, %xmm5, %xmm7 + vpsrlvq %xmm1, %xmm5, %xmm5 + vpor %xmm7, %xmm5, %xmm5 + vpxor %xmm6, %xmm5, %xmm5 + vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0] + vpaddq %xmm5, 
%xmm6, %xmm6 + vpsllvq %xmm2, %xmm5, %xmm7 + vpsrlvq %xmm3, %xmm5, %xmm5 + vpor %xmm7, %xmm5, %xmm5 + vpxor %xmm6, %xmm5, %xmm5 + vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0] + movl $255, %eax + vmovq %rax, %xmm7 + vpslldq $8, %xmm7, %xmm7 ## xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7] + vpxor %xmm7, %xmm4, %xmm4 + vpxor %xmm4, %xmm6, %xmm4 + vpaddq %xmm5, %xmm4, %xmm4 + vpsllvq %xmm0, %xmm5, %xmm6 + vpsrlvq %xmm1, %xmm5, %xmm5 + vpor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0] + vpaddq %xmm5, %xmm4, %xmm4 + vpsllvq %xmm2, %xmm5, %xmm6 + vpsrlvq %xmm3, %xmm5, %xmm5 + vpor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0] + vpaddq %xmm5, %xmm4, %xmm4 + vpsllvq %xmm0, %xmm5, %xmm6 + vpsrlvq %xmm1, %xmm5, %xmm5 + vpor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0] + vpaddq %xmm5, %xmm4, %xmm4 + vpsllvq %xmm2, %xmm5, %xmm6 + vpsrlvq %xmm3, %xmm5, %xmm5 + vpor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0] + vpaddq %xmm5, %xmm4, %xmm4 + vpsllvq %xmm0, %xmm5, %xmm6 + vpsrlvq %xmm1, %xmm5, %xmm5 + vpor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0] + vpaddq %xmm5, %xmm4, %xmm4 + vpsllvq %xmm2, %xmm5, %xmm6 + vpsrlvq %xmm3, %xmm5, %xmm5 + vpor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0] + vpaddq %xmm5, %xmm4, %xmm4 + vpsllvq %xmm0, %xmm5, %xmm0 + vpsrlvq %xmm1, %xmm5, %xmm1 + vpor %xmm0, %xmm1, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpshufd $30, %xmm4, %xmm1 ## xmm1 = xmm4[2,3,1,0] + vpaddq %xmm0, %xmm1, %xmm1 + vpsllvq %xmm2, %xmm0, %xmm2 + vpsrlvq %xmm3, %xmm0, %xmm0 + vpor %xmm2, %xmm0, %xmm0 + vpshufd $30, %xmm1, %xmm2 ## xmm2 = xmm1[2,3,1,0] + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpshufd $78, %xmm0, %xmm1 ## xmm1 = xmm0[2,3,0,1] + vpxor %xmm1, %xmm0, %xmm0 + vmovq %xmm0, %rax + addq $40, %rsp + popq %rbx + popq %rbp + retq + .cfi_endproc +FN_END siphash_avx2 diff --git a/src/libcryptobox/siphash/constants.S b/src/libcryptobox/siphash/constants.S index 8c1a90cbf..0fb27c75e 100644 --- a/src/libcryptobox/siphash/constants.S +++ b/src/libcryptobox/siphash/constants.S @@ -15,3 +15,29 @@ SECTION_RODATA .LC3: .quad 0 .quad 255 + .align 16 +/* For AVX 2 */ +LCPI0_0: + .quad 8317987319222330741 ## 0x736f6d6570736575 + .quad 7237128888997146477 ## 0x646f72616e646f6d + .align 16 +LCPI0_1: + .quad 7816392313619706465 ## 0x6c7967656e657261 + .quad 8387220255154660723 ## 0x7465646279746573 + .align 16 +LCPI0_2: + .quad 13 ## 0xd + .quad 16 ## 0x10 + .align 16 +LCPI0_3: + .quad 51 ## 0x33 + .quad 48 ## 0x30 + .align 16 +LCPI0_4: + .quad 17 ## 0x11 + .quad 21 ## 0x15 + .align 16 +LCPI0_5: + .quad 47 ## 0x2f + .quad 43 ## 0x2b + .align 16 diff --git a/src/libcryptobox/siphash/siphash.c b/src/libcryptobox/siphash/siphash.c index 93e0689bd..8b488d861 100644 --- a/src/libcryptobox/siphash/siphash.c +++ b/src/libcryptobox/siphash/siphash.c @@ -17,9 +17,10 @@ #include "cryptobox.h" #include "siphash.h" #include "platform_config.h" +#include extern unsigned long cpu_config; -static const size_t test_iters = 1000; +static const size_t test_iters = 100000; typedef struct siphash_impl_t { @@ -42,10 +43,17 @@ SIPHASH_DECLARE(ref) SIPHASH_DECLARE(sse41) #define SIPHASH_SSE41 SIPHASH_IMPL(CPUID_SSE41, "sse41", sse41) #endif +#if defined(HAVE_AVX2) && 
defined(__x86_64__)
+SIPHASH_DECLARE(avx2)
+#define SIPHASH_AVX2 SIPHASH_IMPL(CPUID_AVX2, "avx2", avx2)
+#endif
 
 /* list implementations from most optimized to least, with generic as the last entry */
 static const siphash_impl_t siphash_list[] = {
 	SIPHASH_GENERIC,
+#if defined(SIPHASH_AVX2)
+	SIPHASH_AVX2,
+#endif
 #if defined(SIPHASH_SSE41)
 	SIPHASH_SSE41,
 #endif
-- 
2.39.5
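
Note on the generated assembly: the values added to constants.S are the standard SipHash parameters. LCPI0_0 and LCPI0_1 hold the "somepseudorandomlygeneratedbytes" initialization words that are XORed with the 128-bit key, while LCPI0_2 through LCPI0_5 hold paired left/right shift counts (13/51, 16/48, 17/47, 21/43) that emulate 64-bit rotations with vpsllvq/vpsrlvq/vpor, since AVX2 has no packed 64-bit rotate; the rotations by 32 and the lane swap between half-rounds are folded into the vpshufd $30 shuffles. The main loop applies two such rounds per 8-byte block and the finalization applies four more after XORing 0xff into the state, which matches the usual SipHash-2-4 schedule. The scalar sketch below shows the round these constants come from; it is written in plain C for orientation only, is not part of the patch, and the ROTL64/sipround names are illustrative.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Rotate a 64-bit word left by b bits (1 <= b <= 63). */
#define ROTL64(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))

/*
 * One SipHash round over the state v[0..3].  The rotation counts 13, 16,
 * 17 and 21 are the values LCPI0_2..LCPI0_5 encode as left/right shift
 * pairs; the rotations by 32 correspond to the vpshufd $30 shuffles.
 */
static void sipround(uint64_t v[4])
{
	v[0] += v[1]; v[1] = ROTL64(v[1], 13); v[1] ^= v[0]; v[0] = ROTL64(v[0], 32);
	v[2] += v[3]; v[3] = ROTL64(v[3], 16); v[3] ^= v[2];
	v[0] += v[3]; v[3] = ROTL64(v[3], 21); v[3] ^= v[0];
	v[2] += v[1]; v[1] = ROTL64(v[1], 17); v[1] ^= v[2]; v[2] = ROTL64(v[2], 32);
}

int main(void)
{
	/*
	 * State for an all-zero key: v0..v3 are then exactly the
	 * initialization constants stored in LCPI0_0 and LCPI0_1.
	 */
	uint64_t v[4] = {
		0x736f6d6570736575ULL, 0x646f72616e646f6dULL,
		0x6c7967656e657261ULL, 0x7465646279746573ULL
	};

	sipround(v);
	printf("%016" PRIx64 " %016" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n",
			v[0], v[1], v[2], v[3]);
	return 0;
}

The vectorized code appears to keep (v0, v2) in one xmm register and (v1, v3) in the other, so each half-round's two additions, two rotations and two XORs execute as single vector instructions, which is presumably where the gain over the scalar reference comes from. Judging by the CPUID_AVX2 flag passed to SIPHASH_IMPL, the new entry should only be selected at run time on CPUs that report AVX2 support in cpu_config; the selection code itself is outside this hunk.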