author    Vsevolod Stakhov <vsevolod@highsecure.ru>    2016-03-03 10:37:47 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru>    2016-03-03 10:37:47 +0000
commit    d83b9e01d10303a34ddc45a2751e028ec4b8a0f4 (patch)
tree      d96590f4b8c4a4cc8465aaec5b3a015bf9a8848d /src/libcryptobox/siphash/avx2.S
parent    c4073ae8c9ccb7b999b7a1b1d9e6507137c63255 (diff)
download  rspamd-d83b9e01d10303a34ddc45a2751e028ec4b8a0f4.tar.gz
          rspamd-d83b9e01d10303a34ddc45a2751e028ec4b8a0f4.zip
[Feature] Add AVX2 version of siphash
This version has been taken from the Google code at https://github.com/google/highwayhash/ . The final assembly was obtained by compiling the C++ implementation with the clang-3.8 compiler.
Diffstat (limited to 'src/libcryptobox/siphash/avx2.S')
-rw-r--r--    src/libcryptobox/siphash/avx2.S    | 201
1 file changed, 201 insertions(+), 0 deletions(-)
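
For context, the System V AMD64 calling convention used in the new file (%rdi holds the pointer that is loaded with vmovdqu, %rsi the input pointer, %rdx the length, and the 64-bit result is returned in %rax) suggests a C entry point roughly like the sketch below. This is a hedged inference from the register usage only; the actual prototype is declared in rspamd's siphash headers, which are not part of this diff.

#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical declaration inferred from the register usage in avx2.S:
 * %rdi = pointer to the 16-byte key block, %rsi = input buffer,
 * %rdx = input length in bytes, 64-bit hash returned in %rax.
 */
uint64_t siphash_avx2 (const unsigned char *k, const unsigned char *in,
		size_t inlen);
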
diff --git a/src/libcryptobox/siphash/avx2.S b/src/libcryptobox/siphash/avx2.S
new file mode 100644
index 000000000..995fc7636
--- /dev/null
+++ b/src/libcryptobox/siphash/avx2.S
@@ -0,0 +1,201 @@
+/*-
+ * Copyright 2015 Google Inc. All Rights Reserved.
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../macro.S"
+#include "constants.S"
+
+/*
+ * Generated by clang-3.8 from siphash avx2 implementation written by
+ * Jan Wassenberg and Jyrki Alakuijala
+ */
+
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN siphash_avx2
+siphash_avx2_local:
+ .cfi_startproc
+## BB#0: ## %entry
+ pushq %rbp
+Ltmp0:
+ .cfi_def_cfa_offset 16
+Ltmp1:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+Ltmp2:
+ .cfi_def_cfa_register %rbp
+ pushq %rbx
+ subq $40, %rsp
+Ltmp3:
+ .cfi_offset %rbx, -24
+ movq %rdx, %rbx
+ vmovdqu (%rdi), %xmm0
+ vpxor LCPI0_0(%rip), %xmm0, %xmm1
+ vpxor LCPI0_1(%rip), %xmm0, %xmm0
+ vpunpcklqdq %xmm0, %xmm1, %xmm6 ## xmm6 = xmm1[0],xmm0[0]
+ vpunpckhqdq %xmm0, %xmm1, %xmm7 ## xmm7 = xmm1[1],xmm0[1]
+ movq %rbx, %rax
+ andq $-8, %rax
+ je LBB0_1
+## BB#2: ## %for.body.preheader
+ xorl %ecx, %ecx
+ vmovdqa LCPI0_2(%rip), %xmm0 ## xmm0 = [13,16]
+ vmovdqa LCPI0_3(%rip), %xmm1 ## xmm1 = [51,48]
+ vmovdqa LCPI0_4(%rip), %xmm2 ## xmm2 = [17,21]
+ vmovdqa LCPI0_5(%rip), %xmm3 ## xmm3 = [47,43]
+ .align 4, 0x90
+LBB0_3: ## %for.body
+ ## =>This Inner Loop Header: Depth=1
+ vmovq (%rsi,%rcx), %xmm4 ## xmm4 = mem[0],zero
+ vpslldq $8, %xmm4, %xmm5 ## xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+ vpxor %xmm5, %xmm7, %xmm5
+ vpaddq %xmm6, %xmm5, %xmm6
+ vpsllvq %xmm0, %xmm5, %xmm7
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ vpaddq %xmm5, %xmm6, %xmm6
+ vpsllvq %xmm2, %xmm5, %xmm7
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ vpaddq %xmm5, %xmm6, %xmm6
+ vpsllvq %xmm0, %xmm5, %xmm7
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ vpaddq %xmm5, %xmm6, %xmm6
+ vpsllvq %xmm2, %xmm5, %xmm7
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm7
+ vpshufd $30, %xmm6, %xmm5 ## xmm5 = xmm6[2,3,1,0]
+ vpxor %xmm5, %xmm4, %xmm6
+ addq $8, %rcx
+ cmpq %rax, %rcx
+ jb LBB0_3
+## BB#4: ## %for.end.loopexit
+ vmovdqa %xmm7, -48(%rbp) ## 16-byte Spill
+ vmovdqa %xmm6, -32(%rbp) ## 16-byte Spill
+ addq %rax, %rsi
+ jmp LBB0_5
+LBB0_1:
+ vmovdqa %xmm7, -48(%rbp) ## 16-byte Spill
+ vmovdqa %xmm6, -32(%rbp) ## 16-byte Spill
+ xorl %eax, %eax
+LBB0_5: ## %for.end
+ movq $0, -16(%rbp)
+ movq %rbx, %rdx
+ subq %rax, %rdx
+ leaq -16(%rbp), %rdi
+ callq _memcpy
+ movb %bl, -9(%rbp)
+ vmovq -16(%rbp), %xmm4 ## xmm4 = mem[0],zero
+ vpslldq $8, %xmm4, %xmm0 ## xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+ vpxor -48(%rbp), %xmm0, %xmm2 ## 16-byte Folded Reload
+ vpaddq -32(%rbp), %xmm2, %xmm3 ## 16-byte Folded Reload
+ vmovdqa LCPI0_2(%rip), %xmm0 ## xmm0 = [13,16]
+ vpsllvq %xmm0, %xmm2, %xmm5
+ vmovdqa LCPI0_3(%rip), %xmm1 ## xmm1 = [51,48]
+ vpsrlvq %xmm1, %xmm2, %xmm2
+ vpor %xmm5, %xmm2, %xmm2
+ vpxor %xmm3, %xmm2, %xmm5
+ vpshufd $30, %xmm3, %xmm2 ## xmm2 = xmm3[2,3,1,0]
+ vpaddq %xmm5, %xmm2, %xmm6
+ vmovdqa LCPI0_4(%rip), %xmm2 ## xmm2 = [17,21]
+ vpsllvq %xmm2, %xmm5, %xmm7
+ vmovdqa LCPI0_5(%rip), %xmm3 ## xmm3 = [47,43]
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ vpaddq %xmm5, %xmm6, %xmm6
+ vpsllvq %xmm0, %xmm5, %xmm7
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ vpaddq %xmm5, %xmm6, %xmm6
+ vpsllvq %xmm2, %xmm5, %xmm7
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm7, %xmm5, %xmm5
+ vpxor %xmm6, %xmm5, %xmm5
+ vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+ movl $255, %eax
+ vmovq %rax, %xmm7
+ vpslldq $8, %xmm7, %xmm7 ## xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7]
+ vpxor %xmm7, %xmm4, %xmm4
+ vpxor %xmm4, %xmm6, %xmm4
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm0, %xmm5, %xmm6
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm2, %xmm5, %xmm6
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm0, %xmm5, %xmm6
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm2, %xmm5, %xmm6
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm0, %xmm5, %xmm6
+ vpsrlvq %xmm1, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm2, %xmm5, %xmm6
+ vpsrlvq %xmm3, %xmm5, %xmm5
+ vpor %xmm6, %xmm5, %xmm5
+ vpxor %xmm4, %xmm5, %xmm5
+ vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+ vpaddq %xmm5, %xmm4, %xmm4
+ vpsllvq %xmm0, %xmm5, %xmm0
+ vpsrlvq %xmm1, %xmm5, %xmm1
+ vpor %xmm0, %xmm1, %xmm0
+ vpxor %xmm4, %xmm0, %xmm0
+ vpshufd $30, %xmm4, %xmm1 ## xmm1 = xmm4[2,3,1,0]
+ vpaddq %xmm0, %xmm1, %xmm1
+ vpsllvq %xmm2, %xmm0, %xmm2
+ vpsrlvq %xmm3, %xmm0, %xmm0
+ vpor %xmm2, %xmm0, %xmm0
+ vpshufd $30, %xmm1, %xmm2 ## xmm2 = xmm1[2,3,1,0]
+ vpxor %xmm2, %xmm1, %xmm1
+ vpxor %xmm1, %xmm0, %xmm0
+ vpshufd $78, %xmm0, %xmm1 ## xmm1 = xmm0[2,3,0,1]
+ vpxor %xmm1, %xmm0, %xmm0
+ vmovq %xmm0, %rax
+ addq $40, %rsp
+ popq %rbx
+ popq %rbp
+ retq
+ .cfi_endproc
+FN_END siphash_avx2
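
The packed shift constants referenced above (LCPI0_2 = [13,16], LCPI0_3 = [51,48], LCPI0_4 = [17,21], LCPI0_5 = [47,43]) implement the standard SipHash rotations two lanes at a time: the loop applies two compression rounds per 8-byte word, and four finalization rounds follow the xor with 0xff, which matches SipHash-2-4. The scalar sketch below illustrates the round the vector code performs; it is for reference only and is not the C++ source that was compiled.

#include <stdint.h>

/* Rotate a 64-bit word left by b bits */
#define ROTL64(x, b) ((uint64_t)(((x) << (b)) | ((x) >> (64 - (b)))))

/*
 * One scalar SipHash round. The AVX2 code keeps (v0,v2) and (v1,v3) packed
 * in xmm registers, performs the rotations with vpsllvq/vpsrlvq using the
 * counts [13,16] and [17,21] (and their 64-bit complements [51,48] and
 * [47,43]), and a single vpshufd both swaps the v0/v2 lanes and applies the
 * rotate-by-32.
 */
#define SIPROUND(v0, v1, v2, v3) do {                                   \
	(v0) += (v1); (v1) = ROTL64 ((v1), 13); (v1) ^= (v0);           \
	(v0) = ROTL64 ((v0), 32);                                       \
	(v2) += (v3); (v3) = ROTL64 ((v3), 16); (v3) ^= (v2);           \
	(v0) += (v3); (v3) = ROTL64 ((v3), 21); (v3) ^= (v0);           \
	(v2) += (v1); (v1) = ROTL64 ((v1), 17); (v1) ^= (v2);           \
	(v2) = ROTL64 ((v2), 32);                                       \
} while (0)
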