/*-
 * Copyright 2015 Google Inc. All Rights Reserved.
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "../macro.S"
#include "constants.S"

/*
 * Generated by clang-3.8 from the siphash avx2 implementation written by
 * Jan Wassenberg and Jyrki Alakuijala
 */

SECTION_TEXT

GLOBAL_HIDDEN_FN siphash_avx2
siphash_avx2_local:
        .cfi_startproc
## BB#0:                                ## %entry
        pushq   %rbp
Ltmp0:
        .cfi_def_cfa_offset 16
Ltmp1:
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
Ltmp2:
        .cfi_def_cfa_register %rbp
        pushq   %rbx
        subq    $40, %rsp
Ltmp3:
        .cfi_offset %rbx, -24
        movq    %rdx, %rbx
        ## Load the 16-byte key (%rdi) and XOR it with the SipHash initialization
        ## constants (LCPI0_0/LCPI0_1): xmm6 holds the v0/v2 half of the state,
        ## xmm7 holds v1/v3 (v3 in the upper lane, where message words are injected).
        vmovdqu (%rdi), %xmm0
        vpxor   LCPI0_0(%rip), %xmm0, %xmm1
        vpxor   LCPI0_1(%rip), %xmm0, %xmm0
        vpunpcklqdq     %xmm0, %xmm1, %xmm6     ## xmm6 = xmm1[0],xmm0[0]
        vpunpckhqdq     %xmm0, %xmm1, %xmm7     ## xmm7 = xmm1[1],xmm0[1]
        movq    %rbx, %rax
        andq    $-8, %rax
        je      LBB0_1
## BB#2:                                ## %for.body.preheader
        xorl    %ecx, %ecx
        vmovdqa LCPI0_2(%rip), %xmm0    ## xmm0 = [13,16]
        vmovdqa LCPI0_3(%rip), %xmm1    ## xmm1 = [51,48]
        vmovdqa LCPI0_4(%rip), %xmm2    ## xmm2 = [17,21]
        vmovdqa LCPI0_5(%rip), %xmm3    ## xmm3 = [47,43]
        ## Compression loop: one 8-byte little-endian message word per iteration,
        ## two SipRounds per word (the "2" of SipHash-2-4).
        .align  4, 0x90
LBB0_3:                                 ## %for.body
                                        ## =>This Inner Loop Header: Depth=1
        vmovq   (%rsi,%rcx), %xmm4      ## xmm4 = mem[0],zero
        vpslldq $8, %xmm4, %xmm5        ## xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
        vpxor   %xmm5, %xmm7, %xmm5
        vpaddq  %xmm6, %xmm5, %xmm6
        vpsllvq %xmm0, %xmm5, %xmm7
        vpsrlvq %xmm1, %xmm5, %xmm5
        vpor    %xmm7, %xmm5, %xmm5
        vpxor   %xmm6, %xmm5, %xmm5
        vpshufd $30, %xmm6, %xmm6       ## xmm6 = xmm6[2,3,1,0]
        vpaddq  %xmm5, %xmm6, %xmm6
        vpsllvq %xmm2, %xmm5, %xmm7
        vpsrlvq %xmm3, %xmm5, %xmm5
        vpor    %xmm7, %xmm5, %xmm5
        vpxor   %xmm6, %xmm5, %xmm5
        vpshufd $30, %xmm6, %xmm6       ## xmm6 = xmm6[2,3,1,0]
        vpaddq  %xmm5, %xmm6, %xmm6
        vpsllvq %xmm0, %xmm5, %xmm7
        vpsrlvq %xmm1, %xmm5, %xmm5
        vpor    %xmm7, %xmm5, %xmm5
        vpxor   %xmm6, %xmm5, %xmm5
        vpshufd $30, %xmm6, %xmm6       ## xmm6 = xmm6[2,3,1,0]
        vpaddq  %xmm5, %xmm6, %xmm6
        vpsllvq %xmm2, %xmm5, %xmm7
        vpsrlvq %xmm3, %xmm5, %xmm5
        vpor    %xmm7, %xmm5, %xmm5
        vpxor   %xmm6, %xmm5, %xmm7
        vpshufd $30, %xmm6, %xmm5       ## xmm5 = xmm6[2,3,1,0]
        vpxor   %xmm5, %xmm4, %xmm6
        addq    $8, %rcx
        cmpq    %rax, %rcx
        jb      LBB0_3
## BB#4:                                ## %for.end.loopexit
        vmovdqa %xmm7, -48(%rbp)        ## 16-byte Spill
        vmovdqa %xmm6, -32(%rbp)        ## 16-byte Spill
        addq    %rax, %rsi
        jmp     LBB0_5
LBB0_1:
        vmovdqa %xmm7, -48(%rbp)        ## 16-byte Spill
        vmovdqa %xmm6, -32(%rbp)        ## 16-byte Spill
        xorl    %eax, %eax
LBB0_5:                                 ## %for.end
        ## Tail: copy the remaining 0-7 input bytes into a zeroed 8-byte stack
        ## word and store the low byte of the total length (%bl) in its top byte.
        movq    $0, -16(%rbp)
        movq    %rbx, %rdx
        subq    %rax, %rdx
        leaq    -16(%rbp), %rdi
        movq    %rdx, %rcx
        shrq    $2, %rcx
        rep;    movsl
        movq    %rdx, %rcx
        andq    $3, %rcx
        rep;    movsb
        movb    %bl, -9(%rbp)
        ## Feed the final padded word into the state (v3 ^= b) and start its
        ## two SipRounds.
        vmovq   -16(%rbp), %xmm4        ## xmm4 = mem[0],zero
        vpslldq $8, %xmm4, %xmm0        ## xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
        vpxor   -48(%rbp), %xmm0, %xmm2 ## 16-byte Folded Reload
        vpaddq  -32(%rbp), %xmm2, %xmm3 ## 16-byte Folded Reload
        vmovdqa LCPI0_2(%rip), %xmm0    ## xmm0 = [13,16]
        vpsllvq %xmm0, %xmm2, %xmm5
        vmovdqa LCPI0_3(%rip), %xmm1    ## xmm1 = [51,48]
        vpsrlvq %xmm1, %xmm2, %xmm2
        vpor    %xmm5, %xmm2, %xmm2
        vpxor   %xmm3, %xmm2, %xmm5
        vpshufd $30, %xmm3, %xmm2       ## xmm2 = xmm3[2,3,1,0]
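        ## The remainder below finishes the final block's two SipRounds, then
        ## XORs the last padded word into v0 and the 0xff finalization constant
        ## into v2, runs the four finalization SipRounds of SipHash-2-4, and
        ## folds the state into v0 ^ v1 ^ v2 ^ v3, returned in %rax.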
        vpaddq  %xmm5, %xmm2, %xmm6
        vmovdqa LCPI0_4(%rip), %xmm2    ## xmm2 = [17,21]
        vpsllvq %xmm2, %xmm5, %xmm7
        vmovdqa LCPI0_5(%rip), %xmm3    ## xmm3 = [47,43]
        vpsrlvq %xmm3, %xmm5, %xmm5
        vpor    %xmm7, %xmm5, %xmm5
        vpxor   %xmm6, %xmm5, %xmm5
        vpshufd $30, %xmm6, %xmm6       ## xmm6 = xmm6[2,3,1,0]
        vpaddq  %xmm5, %xmm6, %xmm6
        vpsllvq %xmm0, %xmm5, %xmm7
        vpsrlvq %xmm1, %xmm5, %xmm5
        vpor    %xmm7, %xmm5, %xmm5
        vpxor   %xmm6, %xmm5, %xmm5
        vpshufd $30, %xmm6, %xmm6       ## xmm6 = xmm6[2,3,1,0]
        vpaddq  %xmm5, %xmm6, %xmm6
        vpsllvq %xmm2, %xmm5, %xmm7
        vpsrlvq %xmm3, %xmm5, %xmm5
        vpor    %xmm7, %xmm5, %xmm5
        vpxor   %xmm6, %xmm5, %xmm5
        vpshufd $30, %xmm6, %xmm6       ## xmm6 = xmm6[2,3,1,0]
        movl    $255, %eax
        vmovq   %rax, %xmm7
        vpslldq $8, %xmm7, %xmm7        ## xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7]
        vpxor   %xmm7, %xmm4, %xmm4
        vpxor   %xmm4, %xmm6, %xmm4
        vpaddq  %xmm5, %xmm4, %xmm4
        vpsllvq %xmm0, %xmm5, %xmm6
        vpsrlvq %xmm1, %xmm5, %xmm5
        vpor    %xmm6, %xmm5, %xmm5
        vpxor   %xmm4, %xmm5, %xmm5
        vpshufd $30, %xmm4, %xmm4       ## xmm4 = xmm4[2,3,1,0]
        vpaddq  %xmm5, %xmm4, %xmm4
        vpsllvq %xmm2, %xmm5, %xmm6
        vpsrlvq %xmm3, %xmm5, %xmm5
        vpor    %xmm6, %xmm5, %xmm5
        vpxor   %xmm4, %xmm5, %xmm5
        vpshufd $30, %xmm4, %xmm4       ## xmm4 = xmm4[2,3,1,0]
        vpaddq  %xmm5, %xmm4, %xmm4
        vpsllvq %xmm0, %xmm5, %xmm6
        vpsrlvq %xmm1, %xmm5, %xmm5
        vpor    %xmm6, %xmm5, %xmm5
        vpxor   %xmm4, %xmm5, %xmm5
        vpshufd $30, %xmm4, %xmm4       ## xmm4 = xmm4[2,3,1,0]
        vpaddq  %xmm5, %xmm4, %xmm4
        vpsllvq %xmm2, %xmm5, %xmm6
        vpsrlvq %xmm3, %xmm5, %xmm5
        vpor    %xmm6, %xmm5, %xmm5
        vpxor   %xmm4, %xmm5, %xmm5
        vpshufd $30, %xmm4, %xmm4       ## xmm4 = xmm4[2,3,1,0]
        vpaddq  %xmm5, %xmm4, %xmm4
        vpsllvq %xmm0, %xmm5, %xmm6
        vpsrlvq %xmm1, %xmm5, %xmm5
        vpor    %xmm6, %xmm5, %xmm5
        vpxor   %xmm4, %xmm5, %xmm5
        vpshufd $30, %xmm4, %xmm4       ## xmm4 = xmm4[2,3,1,0]
        vpaddq  %xmm5, %xmm4, %xmm4
        vpsllvq %xmm2, %xmm5, %xmm6
        vpsrlvq %xmm3, %xmm5, %xmm5
        vpor    %xmm6, %xmm5, %xmm5
        vpxor   %xmm4, %xmm5, %xmm5
        vpshufd $30, %xmm4, %xmm4       ## xmm4 = xmm4[2,3,1,0]
        vpaddq  %xmm5, %xmm4, %xmm4
        vpsllvq %xmm0, %xmm5, %xmm0
        vpsrlvq %xmm1, %xmm5, %xmm1
        vpor    %xmm0, %xmm1, %xmm0
        vpxor   %xmm4, %xmm0, %xmm0
        vpshufd $30, %xmm4, %xmm1       ## xmm1 = xmm4[2,3,1,0]
        vpaddq  %xmm0, %xmm1, %xmm1
        vpsllvq %xmm2, %xmm0, %xmm2
        vpsrlvq %xmm3, %xmm0, %xmm0
        vpor    %xmm2, %xmm0, %xmm0
        vpshufd $30, %xmm1, %xmm2       ## xmm2 = xmm1[2,3,1,0]
        vpxor   %xmm2, %xmm1, %xmm1
        vpxor   %xmm1, %xmm0, %xmm0
        vpshufd $78, %xmm0, %xmm1       ## xmm1 = xmm0[2,3,0,1]
        vpxor   %xmm1, %xmm0, %xmm0
        vmovq   %xmm0, %rax
        addq    $40, %rsp
        popq    %rbx
        popq    %rbp
        retq
        .cfi_endproc

FN_END siphash_avx2
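
/*
 * Reference sketch (not part of the generated code and not used by the
 * build): a plain C rendering of the SipHash-2-4 computation the code
 * above vectorizes. The prototype is inferred from the register usage
 * (%rdi = 16-byte key, %rsi = input, %rdx = length, 64-bit result in
 * %rax), and the IV words are assumed to be the standard SipHash
 * constants held in LCPI0_0/LCPI0_1; neither is taken from the project
 * headers.
 *
 *   #include <stdint.h>
 *   #include <stddef.h>
 *   #include <string.h>
 *
 *   #define ROTL64(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))
 *   #define SIPROUND do {                                                  \
 *       v0 += v1; v1 = ROTL64(v1, 13); v1 ^= v0; v0 = ROTL64(v0, 32);      \
 *       v2 += v3; v3 = ROTL64(v3, 16); v3 ^= v2;                           \
 *       v0 += v3; v3 = ROTL64(v3, 21); v3 ^= v0;                           \
 *       v2 += v1; v1 = ROTL64(v1, 17); v1 ^= v2; v2 = ROTL64(v2, 32);      \
 *   } while (0)
 *
 *   static uint64_t siphash24_ref(const unsigned char key[16],
 *                                 const unsigned char *in, size_t len)
 *   {
 *       uint64_t k0, k1, m, b;
 *       size_t i, blocks = len & ~(size_t)7;
 *
 *       memcpy(&k0, key, 8);      // little-endian loads, as on x86
 *       memcpy(&k1, key + 8, 8);
 *       uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
 *       uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
 *       uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
 *       uint64_t v3 = k1 ^ 0x7465646279746573ULL;
 *
 *       for (i = 0; i < blocks; i += 8) {   // two rounds per 8-byte word
 *           memcpy(&m, in + i, 8);
 *           v3 ^= m; SIPROUND; SIPROUND; v0 ^= m;
 *       }
 *
 *       b = 0;                              // 0..7 trailing bytes, zero-padded
 *       memcpy(&b, in + blocks, len - blocks);
 *       b |= (uint64_t)(len & 0xff) << 56;  // length byte in the top byte
 *       v3 ^= b; SIPROUND; SIPROUND; v0 ^= b;
 *
 *       v2 ^= 0xff;                         // four finalization rounds
 *       SIPROUND; SIPROUND; SIPROUND; SIPROUND;
 *       return v0 ^ v1 ^ v2 ^ v3;
 *   }
 */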