| author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-03-03 10:37:47 +0000 |
|---|---|---|
| committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-03-03 10:37:47 +0000 |
| commit | d83b9e01d10303a34ddc45a2751e028ec4b8a0f4 (patch) | |
| tree | d96590f4b8c4a4cc8465aaec5b3a015bf9a8848d /src/libcryptobox/siphash/avx2.S | |
| parent | c4073ae8c9ccb7b999b7a1b1d9e6507137c63255 (diff) | |
| download | rspamd-d83b9e01d10303a34ddc45a2751e028ec4b8a0f4.tar.gz | rspamd-d83b9e01d10303a34ddc45a2751e028ec4b8a0f4.zip |
[Feature] Add AVX2 version of siphash
This version has been taken from Google's HighwayHash code at
https://github.com/google/highwayhash/
The final assembly was obtained by compiling the C++ implementation with the clang-3.8 compiler.
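Judging from the register usage in the assembly below (a 16-byte key is loaded from %rdi and xored with two constants from the constant pool, %rsi walks the input buffer, %rdx carries the length, and the 64-bit digest comes back in %rax), the exported symbol appears to follow the usual System V AMD64 convention. A plausible C-side declaration is sketched here; the actual prototype used by rspamd's cryptobox glue is not part of this diff, so treat the parameter types as an assumption.

```c
#include <stddef.h>
#include <stdint.h>

/*
 * Assumed prototype, inferred from register usage in avx2.S:
 *   %rdi -> 16-byte key, %rsi -> input bytes, %rdx -> input length,
 *   64-bit digest returned in %rax.
 */
uint64_t siphash_avx2 (const unsigned char key[16],
		const unsigned char *in, size_t inlen);
```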
Diffstat (limited to 'src/libcryptobox/siphash/avx2.S')
-rw-r--r-- | src/libcryptobox/siphash/avx2.S | 201 |
1 file changed, 201 insertions, 0 deletions
diff --git a/src/libcryptobox/siphash/avx2.S b/src/libcryptobox/siphash/avx2.S
new file mode 100644
index 000000000..995fc7636
--- /dev/null
+++ b/src/libcryptobox/siphash/avx2.S
@@ -0,0 +1,201 @@
+/*-
+ * Copyright 2015 Google Inc. All Rights Reserved.
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../macro.S"
+#include "constants.S"
+
+/*
+ * Generated by clang-3.8 from siphash avx2 implementation written by
+ * Jan Wassenberg and Jyrki Alakuijala
+ */
+
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN siphash_avx2
+siphash_avx2_local:
+	.cfi_startproc
+## BB#0: ## %entry
+	pushq %rbp
+Ltmp0:
+	.cfi_def_cfa_offset 16
+Ltmp1:
+	.cfi_offset %rbp, -16
+	movq %rsp, %rbp
+Ltmp2:
+	.cfi_def_cfa_register %rbp
+	pushq %rbx
+	subq $40, %rsp
+Ltmp3:
+	.cfi_offset %rbx, -24
+	movq %rdx, %rbx
+	vmovdqu (%rdi), %xmm0
+	vpxor LCPI0_0(%rip), %xmm0, %xmm1
+	vpxor LCPI0_1(%rip), %xmm0, %xmm0
+	vpunpcklqdq %xmm0, %xmm1, %xmm6 ## xmm6 = xmm1[0],xmm0[0]
+	vpunpckhqdq %xmm0, %xmm1, %xmm7 ## xmm7 = xmm1[1],xmm0[1]
+	movq %rbx, %rax
+	andq $-8, %rax
+	je LBB0_1
+## BB#2: ## %for.body.preheader
+	xorl %ecx, %ecx
+	vmovdqa LCPI0_2(%rip), %xmm0 ## xmm0 = [13,16]
+	vmovdqa LCPI0_3(%rip), %xmm1 ## xmm1 = [51,48]
+	vmovdqa LCPI0_4(%rip), %xmm2 ## xmm2 = [17,21]
+	vmovdqa LCPI0_5(%rip), %xmm3 ## xmm3 = [47,43]
+	.align 4, 0x90
+LBB0_3: ## %for.body
+	## =>This Inner Loop Header: Depth=1
+	vmovq (%rsi,%rcx), %xmm4 ## xmm4 = mem[0],zero
+	vpslldq $8, %xmm4, %xmm5 ## xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+	vpxor %xmm5, %xmm7, %xmm5
+	vpaddq %xmm6, %xmm5, %xmm6
+	vpsllvq %xmm0, %xmm5, %xmm7
+	vpsrlvq %xmm1, %xmm5, %xmm5
+	vpor %xmm7, %xmm5, %xmm5
+	vpxor %xmm6, %xmm5, %xmm5
+	vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+	vpaddq %xmm5, %xmm6, %xmm6
+	vpsllvq %xmm2, %xmm5, %xmm7
+	vpsrlvq %xmm3, %xmm5, %xmm5
+	vpor %xmm7, %xmm5, %xmm5
+	vpxor %xmm6, %xmm5, %xmm5
+	vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+	vpaddq %xmm5, %xmm6, %xmm6
+	vpsllvq %xmm0, %xmm5, %xmm7
+	vpsrlvq %xmm1, %xmm5, %xmm5
+	vpor %xmm7, %xmm5, %xmm5
+	vpxor %xmm6, %xmm5, %xmm5
+	vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+	vpaddq %xmm5, %xmm6, %xmm6
+	vpsllvq %xmm2, %xmm5, %xmm7
+	vpsrlvq %xmm3, %xmm5, %xmm5
+	vpor %xmm7, %xmm5, %xmm5
+	vpxor %xmm6, %xmm5, %xmm7
+	vpshufd $30, %xmm6, %xmm5 ## xmm5 = xmm6[2,3,1,0]
+	vpxor %xmm5, %xmm4, %xmm6
+	addq $8, %rcx
+	cmpq %rax, %rcx
+	jb LBB0_3
+## BB#4: ## %for.end.loopexit
+	vmovdqa %xmm7, -48(%rbp) ## 16-byte Spill
+	vmovdqa %xmm6, -32(%rbp) ## 16-byte Spill
+	addq %rax, %rsi
+	jmp LBB0_5
+LBB0_1:
+	vmovdqa %xmm7, -48(%rbp) ## 16-byte Spill
+	vmovdqa %xmm6, -32(%rbp) ## 16-byte Spill
+	xorl %eax, %eax
+LBB0_5: ## %for.end
+	movq $0, -16(%rbp)
+	movq %rbx, %rdx
+	subq %rax, %rdx
+	leaq -16(%rbp), %rdi
+	callq _memcpy
+	movb %bl, -9(%rbp)
+	vmovq -16(%rbp), %xmm4 ## xmm4 = mem[0],zero
+	vpslldq $8, %xmm4, %xmm0 ## xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+	vpxor -48(%rbp), %xmm0, %xmm2 ## 16-byte Folded Reload
+	vpaddq -32(%rbp), %xmm2, %xmm3 ## 16-byte Folded Reload
+	vmovdqa LCPI0_2(%rip), %xmm0 ## xmm0 = [13,16]
+	vpsllvq %xmm0, %xmm2, %xmm5
+	vmovdqa LCPI0_3(%rip), %xmm1 ## xmm1 = [51,48]
+	vpsrlvq %xmm1, %xmm2, %xmm2
+	vpor %xmm5, %xmm2, %xmm2
+	vpxor %xmm3, %xmm2, %xmm5
+	vpshufd $30, %xmm3, %xmm2 ## xmm2 = xmm3[2,3,1,0]
+	vpaddq %xmm5, %xmm2, %xmm6
+	vmovdqa LCPI0_4(%rip), %xmm2 ## xmm2 = [17,21]
+	vpsllvq %xmm2, %xmm5, %xmm7
+	vmovdqa LCPI0_5(%rip), %xmm3 ## xmm3 = [47,43]
+	vpsrlvq %xmm3, %xmm5, %xmm5
+	vpor %xmm7, %xmm5, %xmm5
+	vpxor %xmm6, %xmm5, %xmm5
+	vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+	vpaddq %xmm5, %xmm6, %xmm6
+	vpsllvq %xmm0, %xmm5, %xmm7
+	vpsrlvq %xmm1, %xmm5, %xmm5
+	vpor %xmm7, %xmm5, %xmm5
+	vpxor %xmm6, %xmm5, %xmm5
+	vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+	vpaddq %xmm5, %xmm6, %xmm6
+	vpsllvq %xmm2, %xmm5, %xmm7
+	vpsrlvq %xmm3, %xmm5, %xmm5
+	vpor %xmm7, %xmm5, %xmm5
+	vpxor %xmm6, %xmm5, %xmm5
+	vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
+	movl $255, %eax
+	vmovq %rax, %xmm7
+	vpslldq $8, %xmm7, %xmm7 ## xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7]
+	vpxor %xmm7, %xmm4, %xmm4
+	vpxor %xmm4, %xmm6, %xmm4
+	vpaddq %xmm5, %xmm4, %xmm4
+	vpsllvq %xmm0, %xmm5, %xmm6
+	vpsrlvq %xmm1, %xmm5, %xmm5
+	vpor %xmm6, %xmm5, %xmm5
+	vpxor %xmm4, %xmm5, %xmm5
+	vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+	vpaddq %xmm5, %xmm4, %xmm4
+	vpsllvq %xmm2, %xmm5, %xmm6
+	vpsrlvq %xmm3, %xmm5, %xmm5
+	vpor %xmm6, %xmm5, %xmm5
+	vpxor %xmm4, %xmm5, %xmm5
+	vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+	vpaddq %xmm5, %xmm4, %xmm4
+	vpsllvq %xmm0, %xmm5, %xmm6
+	vpsrlvq %xmm1, %xmm5, %xmm5
+	vpor %xmm6, %xmm5, %xmm5
+	vpxor %xmm4, %xmm5, %xmm5
+	vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+	vpaddq %xmm5, %xmm4, %xmm4
+	vpsllvq %xmm2, %xmm5, %xmm6
+	vpsrlvq %xmm3, %xmm5, %xmm5
+	vpor %xmm6, %xmm5, %xmm5
+	vpxor %xmm4, %xmm5, %xmm5
+	vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+	vpaddq %xmm5, %xmm4, %xmm4
+	vpsllvq %xmm0, %xmm5, %xmm6
+	vpsrlvq %xmm1, %xmm5, %xmm5
+	vpor %xmm6, %xmm5, %xmm5
+	vpxor %xmm4, %xmm5, %xmm5
+	vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+	vpaddq %xmm5, %xmm4, %xmm4
+	vpsllvq %xmm2, %xmm5, %xmm6
+	vpsrlvq %xmm3, %xmm5, %xmm5
+	vpor %xmm6, %xmm5, %xmm5
+	vpxor %xmm4, %xmm5, %xmm5
+	vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
+	vpaddq %xmm5, %xmm4, %xmm4
+	vpsllvq %xmm0, %xmm5, %xmm0
+	vpsrlvq %xmm1, %xmm5, %xmm1
+	vpor %xmm0, %xmm1, %xmm0
+	vpxor %xmm4, %xmm0, %xmm0
+	vpshufd $30, %xmm4, %xmm1 ## xmm1 = xmm4[2,3,1,0]
+	vpaddq %xmm0, %xmm1, %xmm1
+	vpsllvq %xmm2, %xmm0, %xmm2
+	vpsrlvq %xmm3, %xmm0, %xmm0
+	vpor %xmm2, %xmm0, %xmm0
+	vpshufd $30, %xmm1, %xmm2 ## xmm2 = xmm1[2,3,1,0]
+	vpxor %xmm2, %xmm1, %xmm1
+	vpxor %xmm1, %xmm0, %xmm0
+	vpshufd $78, %xmm0, %xmm1 ## xmm1 = xmm0[2,3,0,1]
+	vpxor %xmm1, %xmm0, %xmm0
+	vmovq %xmm0, %rax
+	addq $40, %rsp
+	popq %rbx
+	popq %rbp
+	retq
+	.cfi_endproc
+FN_END siphash_avx2
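The generated code packs the four 64-bit SipHash state words into two XMM registers, so each vpsllvq/vpsrlvq/vpor triple performs two scalar rotations at once: the per-lane shift counts [13,16] and [17,21] (with their 64-bit complements [51,48] and [47,43]) loaded from LCPI0_2..LCPI0_5 are the standard SipHash rotation amounts, and vpshufd $30 swaps the two 64-bit lanes while rotating one of them by 32 bits. For readability only, here is the textbook scalar SipRound that each pair of those vector sequences corresponds to; this sketch is not part of the generated file, and the macro names are illustrative.

```c
#include <stdint.h>

/* Rotate a 64-bit word left by b bits. */
#define ROTL64(x, b) ((uint64_t)(((x) << (b)) | ((x) >> (64 - (b)))))

/*
 * Reference SipRound (scalar). The AVX2 code above evaluates the
 * v0/v2 and v1/v3 halves of these steps in parallel, one 64-bit
 * state word per XMM lane.
 */
#define SIPROUND(v0, v1, v2, v3)                                     \
	do {                                                         \
		(v0) += (v1); (v1) = ROTL64((v1), 13); (v1) ^= (v0); \
		(v0) = ROTL64((v0), 32);                             \
		(v2) += (v3); (v3) = ROTL64((v3), 16); (v3) ^= (v2); \
		(v0) += (v3); (v3) = ROTL64((v3), 21); (v3) ^= (v0); \
		(v2) += (v1); (v1) = ROTL64((v1), 17); (v1) ^= (v2); \
		(v2) = ROTL64((v2), 32);                             \
	} while (0)
```

In the main loop each 8-byte block is xored into the state and two such rounds are applied (SipHash-2-4's c = 2); the finalization after the 0xff xor applies four more (d = 4).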