path: root/src/libcryptobox
author     Vsevolod Stakhov <vsevolod@highsecure.ru>  2016-03-03 10:37:47 +0000
committer  Vsevolod Stakhov <vsevolod@highsecure.ru>  2016-03-03 10:37:47 +0000
commit     d83b9e01d10303a34ddc45a2751e028ec4b8a0f4 (patch)
tree       d96590f4b8c4a4cc8465aaec5b3a015bf9a8848d /src/libcryptobox
parent     c4073ae8c9ccb7b999b7a1b1d9e6507137c63255 (diff)
download   rspamd-d83b9e01d10303a34ddc45a2751e028ec4b8a0f4.tar.gz
           rspamd-d83b9e01d10303a34ddc45a2751e028ec4b8a0f4.zip
[Feature] Add AVX2 version of siphash
This version has been taken from Google's code at https://github.com/google/highwayhash/. The final assembly was obtained by compiling the C++ implementation with the clang-3.8 compiler.
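For reference, the register usage in the new avx2.S (16-byte key pointer in %rdi, input pointer in %rsi, length in %rdx, 64-bit digest returned in %rax) suggests an entry point along the following lines. This is only a sketch inferred from the calling convention; the real declaration is produced by the SIPHASH_DECLARE macro in siphash.h and the parameter names here are illustrative:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical prototype inferred from the assembly's calling convention;
 * the actual declaration comes from SIPHASH_DECLARE in siphash.h. */
uint64_t siphash_avx2(const unsigned char key[16],
                      const unsigned char *in,
                      size_t inlen);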
Diffstat (limited to 'src/libcryptobox')
-rw-r--r--  src/libcryptobox/CMakeLists.txt         1
-rw-r--r--  src/libcryptobox/siphash/avx2.S       201
-rw-r--r--  src/libcryptobox/siphash/constants.S   26
-rw-r--r--  src/libcryptobox/siphash/siphash.c     10
4 files changed, 237 insertions, 1 deletion
diff --git a/src/libcryptobox/CMakeLists.txt b/src/libcryptobox/CMakeLists.txt
index b3defe63f..8318f3ed9 100644
--- a/src/libcryptobox/CMakeLists.txt
+++ b/src/libcryptobox/CMakeLists.txt
@@ -67,6 +67,7 @@ ENDIF()
IF(HAVE_AVX2)
SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/avx2.S)
SET(POLYSRC ${POLYSRC} ${CMAKE_CURRENT_SOURCE_DIR}/poly1305/avx2.S)
+ SET(SIPHASHSRC ${SIPHASHSRC} ${CMAKE_CURRENT_SOURCE_DIR}/siphash/avx2.S)
ENDIF(HAVE_AVX2)
IF(HAVE_AVX)
SET(CHACHASRC ${CHACHASRC} ${CMAKE_CURRENT_SOURCE_DIR}/chacha20/avx.S)
diff --git a/src/libcryptobox/siphash/avx2.S b/src/libcryptobox/siphash/avx2.S
new file mode 100644
index 000000000..995fc7636
--- /dev/null
+++ b/src/libcryptobox/siphash/avx2.S
@@ -0,0 +1,201 @@
+/*-
+ * Copyright 2015 Google Inc. All Rights Reserved.
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../macro.S"
+#include "constants.S"
+
+/*
+ * Generated by clang-3.8 from the SipHash AVX2 implementation written by
+ * Jan Wassenberg and Jyrki Alakuijala
+ */
+
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN siphash_avx2
+siphash_avx2_local:
+ .cfi_startproc
+## BB#0: ## %entry
+ pushq %rbp
+Ltmp0:
+ .cfi_def_cfa_offset 16
+Ltmp1:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+Ltmp2:
+ .cfi_def_cfa_register %rbp
+ pushq %rbx
+ subq $40, %rsp
+Ltmp3:
+ .cfi_offset %rbx, -24
+ movq %rdx, %rbx
+ vmovdqu (%rdi), %xmm0
+ vpxor LCPI0_0(%rip), %xmm0, %xmm1
+ vpxor LCPI0_1(%rip), %xmm0, %xmm0
+ vpunpcklqdq %xmm0, %xmm1, %xmm6 ## xmm6 = xmm1[0],xmm0[0]
+ vpunpckhqdq %xmm0, %xmm1, %xmm7 ## xmm7 = xmm1[1],xmm0[1]
+ movq %rbx, %rax
+ andq $-8, %rax
+ je LBB0_1
+## BB#2: ## %for.body.preheader
+ xorl %ecx, %ecx
+ vmovdqa LCPI0_2(%rip), %xmm0 ## xmm0 = [13,16]
+ vmovdqa LCPI0_3(%rip), %xmm1 ## xmm1 = [51,48]
+ vmovdqa LCPI0_4(%rip), %xmm2 ## xmm2 = [17,21]
+ vmovdqa LCPI0_5(%rip), %xmm3 ## xmm3 = [47,43]
+ .align 4, 0x90
+LBB0_3: ## %for.body
+ ## =>This Inner Loop Header: Depth=1
+ vmovq (%rsi,%rcx), %xmm4 ## xmm4 = mem[0],zero
+ vpslldq $8, %xmm4, %xmm5 ## xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+ vpxor %xmm5, %xmm7, %xmm5
+ vpaddq %xmm6, %xmm5, %xmm6
+ vpsllvq %xmm0, %xmm5, %xmm7
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ vpaddq %xmm5, %xmm6, %xmm6
+ vpsllvq %xmm2, %xmm5, %xmm7
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ vpaddq %xmm5, %xmm6, %xmm6
+ vpsllvq %xmm0, %xmm5, %xmm7
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ vpaddq %xmm5, %xmm6, %xmm6
+ vpsllvq %xmm2, %xmm5, %xmm7
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm7
+ vpshufd $30, %xmm6, %xmm5 ## xmm5 = xmm6[2,3,1,0]
+ vpxor %xmm5, %xmm4, %xmm6
+ addq $8, %rcx
+ cmpq %rax, %rcx
+ jb LBB0_3
+## BB#4: ## %for.end.loopexit
+ vmovdqa %xmm7, -48(%rbp) ## 16-byte Spill
+ vmovdqa %xmm6, -32(%rbp) ## 16-byte Spill
+ addq %rax, %rsi
+ jmp LBB0_5
+LBB0_1:
+ vmovdqa %xmm7, -48(%rbp) ## 16-byte Spill
+ vmovdqa %xmm6, -32(%rbp) ## 16-byte Spill
+ xorl %eax, %eax
+LBB0_5: ## %for.end
+ movq $0, -16(%rbp)
+ movq %rbx, %rdx
+ subq %rax, %rdx
+ leaq -16(%rbp), %rdi
+ callq _memcpy
+ movb %bl, -9(%rbp)
+ vmovq -16(%rbp), %xmm4 ## xmm4 = mem[0],zero
+ vpslldq $8, %xmm4, %xmm0 ## xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+ vpxor -48(%rbp), %xmm0, %xmm2 ## 16-byte Folded Reload
+ vpaddq -32(%rbp), %xmm2, %xmm3 ## 16-byte Folded Reload
+ vmovdqa LCPI0_2(%rip), %xmm0 ## xmm0 = [13,16]
+ vpsllvq %xmm0, %xmm2, %xmm5
+ vmovdqa LCPI0_3(%rip), %xmm1 ## xmm1 = [51,48]
+ vpsrlvq %xmm1, %xmm2, %xmm2
+ vpor %xmm5, %xmm2, %xmm2
+ vpxor %xmm3, %xmm2, %xmm5
+ vpshufd $30, %xmm3, %xmm2 ## xmm2 = xmm3[2,3,1,0]
+ vpaddq %xmm5, %xmm2, %xmm6
+ vmovdqa LCPI0_4(%rip), %xmm2 ## xmm2 = [17,21]
+ vpsllvq %xmm2, %xmm5, %xmm7
+ vmovdqa LCPI0_5(%rip), %xmm3 ## xmm3 = [47,43]
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ vpaddq %xmm5, %xmm6, %xmm6
+ vpsllvq %xmm0, %xmm5, %xmm7
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ vpaddq %xmm5, %xmm6, %xmm6
+ vpsllvq %xmm2, %xmm5, %xmm7
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ movl $255, %eax
+ vmovq %rax, %xmm7
+ vpslldq $8, %xmm7, %xmm7 ## xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7]
+ vpxor %xmm7, %xmm4, %xmm4
+ vpxor %xmm4, %xmm6, %xmm4
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm0, %xmm5, %xmm6
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm2, %xmm5, %xmm6
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm0, %xmm5, %xmm6
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm2, %xmm5, %xmm6
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm0, %xmm5, %xmm6
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm2, %xmm5, %xmm6
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm0, %xmm5, %xmm0
+ vpsrlvq %xmm1, %xmm5, %xmm1
+ vpor %xmm0, %xmm1, %xmm0
+ vpxor %xmm4, %xmm0, %xmm0
+ vpshufd $30, %xmm4, %xmm1 ## xmm1 = xmm4[2,3,1,0]
+ vpaddq %xmm0, %xmm1, %xmm1
+ vpsllvq %xmm2, %xmm0, %xmm2
+ vpsrlvq %xmm3, %xmm0, %xmm0
+ vpor %xmm2, %xmm0, %xmm0
+ vpshufd $30, %xmm1, %xmm2 ## xmm2 = xmm1[2,3,1,0]
+ vpxor %xmm2, %xmm1, %xmm1
+ vpxor %xmm1, %xmm0, %xmm0
+ vpshufd $78, %xmm0, %xmm1 ## xmm1 = xmm0[2,3,0,1]
+ vpxor %xmm1, %xmm0, %xmm0
+ vmovq %xmm0, %rax
+ addq $40, %rsp
+ popq %rbx
+ popq %rbp
+ retq
+ .cfi_endproc
+FN_END siphash_avx2
diff --git a/src/libcryptobox/siphash/constants.S b/src/libcryptobox/siphash/constants.S
index 8c1a90cbf..0fb27c75e 100644
--- a/src/libcryptobox/siphash/constants.S
+++ b/src/libcryptobox/siphash/constants.S
@@ -15,3 +15,29 @@ SECTION_RODATA
.LC3:
.quad 0
.quad 255
+ .align 16
+/* For AVX 2 */
+LCPI0_0:
+ .quad 8317987319222330741 ## 0x736f6d6570736575
+ .quad 7237128888997146477 ## 0x646f72616e646f6d
+ .align 16
+LCPI0_1:
+ .quad 7816392313619706465 ## 0x6c7967656e657261
+ .quad 8387220255154660723 ## 0x7465646279746573
+ .align 16
+LCPI0_2:
+ .quad 13 ## 0xd
+ .quad 16 ## 0x10
+ .align 16
+LCPI0_3:
+ .quad 51 ## 0x33
+ .quad 48 ## 0x30
+ .align 16
+LCPI0_4:
+ .quad 17 ## 0x11
+ .quad 21 ## 0x15
+ .align 16
+LCPI0_5:
+ .quad 47 ## 0x2f
+ .quad 43 ## 0x2b
+ .align 16
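LCPI0_0 and LCPI0_1 above are the standard SipHash initialization constants ("somepseudorandomlygeneratedbytes") that get xored with the key, while LCPI0_2 through LCPI0_5 hold the rotation amounts 13/16 and 17/21 together with their 64-bit complements 51/48 and 47/43; that is how the assembly builds 64-bit rotates out of vpsllvq/vpsrlvq, with the rotate-by-32 steps handled by vpshufd. For orientation only, here is a scalar sketch of the standard SipHash round that the vector code computes on the (v0,v2)/(v1,v3) lane pairs; it is not part of this commit:

#include <stdint.h>

#define ROTL64(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))

/* One SipHash round over the state words v0..v3 (scalar reference form). */
static inline void sip_round(uint64_t v[4])
{
	v[0] += v[1]; v[1] = ROTL64(v[1], 13); v[1] ^= v[0]; v[0] = ROTL64(v[0], 32);
	v[2] += v[3]; v[3] = ROTL64(v[3], 16); v[3] ^= v[2];
	v[0] += v[3]; v[3] = ROTL64(v[3], 21); v[3] ^= v[0];
	v[2] += v[1]; v[1] = ROTL64(v[1], 17); v[1] ^= v[2]; v[2] = ROTL64(v[2], 32);
}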
diff --git a/src/libcryptobox/siphash/siphash.c b/src/libcryptobox/siphash/siphash.c
index 93e0689bd..8b488d861 100644
--- a/src/libcryptobox/siphash/siphash.c
+++ b/src/libcryptobox/siphash/siphash.c
@@ -17,9 +17,10 @@
#include "cryptobox.h"
#include "siphash.h"
#include "platform_config.h"
+#include <stdbool.h>
extern unsigned long cpu_config;
-static const size_t test_iters = 1000;
+static const size_t test_iters = 100000;
typedef struct siphash_impl_t
{
@@ -42,10 +43,17 @@ SIPHASH_DECLARE(ref)
SIPHASH_DECLARE(sse41)
#define SIPHASH_SSE41 SIPHASH_IMPL(CPUID_SSE41, "sse41", sse41)
#endif
+#if defined(HAVE_AVX2) && defined(__x86_64__)
+SIPHASH_DECLARE(avx2)
+#define SIPHASH_AVX2 SIPHASH_IMPL(CPUID_AVX2, "avx2", avx2)
+#endif
/* list implementations from most optimized to least, with generic as the last entry */
static const siphash_impl_t siphash_list[] = {
SIPHASH_GENERIC,
+#if defined(SIPHASH_AVX2)
+ SIPHASH_AVX2,
+#endif
#if defined(SIPHASH_SSE41)
SIPHASH_SSE41,
#endif
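Each SIPHASH_IMPL entry bundles a required CPUID flag, a short name, and the corresponding function(s), and the table is checked against cpu_config at start-up to decide which variant to self-test and use. The following is only a minimal sketch of that kind of selection with hypothetical names; the actual struct layout and selection logic live in the parts of siphash.c not shown in this hunk:

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: a table of implementations gated by CPU feature bits,
 * mirroring the shape of siphash_impl_t; names and fields are assumptions. */
typedef struct {
	unsigned long cpu_flags;  /* CPUID bits this variant requires        */
	const char *desc;         /* "generic", "sse41", "avx2", ...         */
	uint64_t (*hash)(const unsigned char key[16],
	                 const unsigned char *in, size_t inlen);
} hash_impl_sketch_t;

/* Return the first entry whose CPU requirements are all satisfied. */
static const hash_impl_sketch_t *
select_impl(const hash_impl_sketch_t *list, size_t n, unsigned long cpu_config)
{
	for (size_t i = 0; i < n; i++) {
		if ((list[i].cpu_flags & cpu_config) == list[i].cpu_flags) {
			return &list[i];
		}
	}
	return NULL; /* no usable entry; a generic fallback should always match */
}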