/*-
* Copyright 2015 Google Inc. All Rights Reserved.
* Copyright 2016 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../macro.S"
#include "constants.S"
/*
 * Generated by clang-3.8 from the SipHash AVX2 implementation written by
 * Jan Wassenberg and Jyrki Alakuijala
 */
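/*
 * Register usage (System V AMD64 calling convention), inferred from the
 * generated code below and noted here purely as a reading aid:
 *   %rdi - pointer to the 16-byte SipHash key
 *   %rsi - pointer to the input buffer
 *   %rdx - input length in bytes (saved in %rbx)
 *   %rax - 64-bit hash value returned to the caller
 */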
SECTION_TEXT
GLOBAL_HIDDEN_FN siphash_avx2
siphash_avx2_local:
.cfi_startproc
## BB#0: ## %entry
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
pushq %rbx
subq $40, %rsp
Ltmp3:
.cfi_offset %rbx, -24
movq %rdx, %rbx
vmovdqu (%rdi), %xmm0
vpxor LCPI0_0(%rip), %xmm0, %xmm1
vpxor LCPI0_1(%rip), %xmm0, %xmm0
vpunpcklqdq %xmm0, %xmm1, %xmm6 ## xmm6 = xmm1[0],xmm0[0]
vpunpckhqdq %xmm0, %xmm1, %xmm7 ## xmm7 = xmm1[1],xmm0[1]
movq %rbx, %rax
andq $-8, %rax
je LBB0_1
## BB#2: ## %for.body.preheader
xorl %ecx, %ecx
vmovdqa LCPI0_2(%rip), %xmm0 ## xmm0 = [13,16]
vmovdqa LCPI0_3(%rip), %xmm1 ## xmm1 = [51,48]
vmovdqa LCPI0_4(%rip), %xmm2 ## xmm2 = [17,21]
vmovdqa LCPI0_5(%rip), %xmm3 ## xmm3 = [47,43]
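## [13,16] and [17,21] are the SipHash left-rotation counts; [51,48] and
## [47,43] are the complementary right-shift counts, since AVX2 has no
## 64-bit rotate and rotl(x,k) is emulated as (x << k) | (x >> (64-k)).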
.align 4, 0x90
LBB0_3: ## %for.body
## =>This Inner Loop Header: Depth=1
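## Each iteration absorbs one 8-byte message word into the packed state
## and applies two SipHash compression rounds (the "2" of SipHash-2-4).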
vmovq (%rsi,%rcx), %xmm4 ## xmm4 = mem[0],zero
vpslldq $8, %xmm4, %xmm5 ## xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
vpxor %xmm5, %xmm7, %xmm5
vpaddq %xmm6, %xmm5, %xmm6
vpsllvq %xmm0, %xmm5, %xmm7
vpsrlvq %xmm1, %xmm5, %xmm5
vpor %xmm7, %xmm5, %xmm5
vpxor %xmm6, %xmm5, %xmm5
vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
vpaddq %xmm5, %xmm6, %xmm6
vpsllvq %xmm2, %xmm5, %xmm7
vpsrlvq %xmm3, %xmm5, %xmm5
vpor %xmm7, %xmm5, %xmm5
vpxor %xmm6, %xmm5, %xmm5
vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
vpaddq %xmm5, %xmm6, %xmm6
vpsllvq %xmm0, %xmm5, %xmm7
vpsrlvq %xmm1, %xmm5, %xmm5
vpor %xmm7, %xmm5, %xmm5
vpxor %xmm6, %xmm5, %xmm5
vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
vpaddq %xmm5, %xmm6, %xmm6
vpsllvq %xmm2, %xmm5, %xmm7
vpsrlvq %xmm3, %xmm5, %xmm5
vpor %xmm7, %xmm5, %xmm5
vpxor %xmm6, %xmm5, %xmm7
vpshufd $30, %xmm6, %xmm5 ## xmm5 = xmm6[2,3,1,0]
vpxor %xmm5, %xmm4, %xmm6
addq $8, %rcx
cmpq %rax, %rcx
jb LBB0_3
## BB#4: ## %for.end.loopexit
vmovdqa %xmm7, -48(%rbp) ## 16-byte Spill
vmovdqa %xmm6, -32(%rbp) ## 16-byte Spill
addq %rax, %rsi
jmp LBB0_5
LBB0_1:
vmovdqa %xmm7, -48(%rbp) ## 16-byte Spill
vmovdqa %xmm6, -32(%rbp) ## 16-byte Spill
xorl %eax, %eax
LBB0_5: ## %for.end
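## Tail handling: copy the remaining (len % 8) bytes into a zeroed 8-byte
## stack slot and store the low byte of the total length in its last byte,
## per SipHash's padding rule, before absorbing the final word below.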
movq $0, -16(%rbp)
movq %rbx, %rdx
subq %rax, %rdx
leaq -16(%rbp), %rdi
movq %rdx, %rcx
shrq $2, %rcx
rep; movsl
movq %rdx, %rcx
andq $3, %rcx
rep; movsb
movb %bl, -9(%rbp)
vmovq -16(%rbp), %xmm4 ## xmm4 = mem[0],zero
vpslldq $8, %xmm4, %xmm0 ## xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
vpxor -48(%rbp), %xmm0, %xmm2 ## 16-byte Folded Reload
vpaddq -32(%rbp), %xmm2, %xmm3 ## 16-byte Folded Reload
vmovdqa LCPI0_2(%rip), %xmm0 ## xmm0 = [13,16]
vpsllvq %xmm0, %xmm2, %xmm5
vmovdqa LCPI0_3(%rip), %xmm1 ## xmm1 = [51,48]
vpsrlvq %xmm1, %xmm2, %xmm2
vpor %xmm5, %xmm2, %xmm2
vpxor %xmm3, %xmm2, %xmm5
vpshufd $30, %xmm3, %xmm2 ## xmm2 = xmm3[2,3,1,0]
vpaddq %xmm5, %xmm2, %xmm6
vmovdqa LCPI0_4(%rip), %xmm2 ## xmm2 = [17,21]
vpsllvq %xmm2, %xmm5, %xmm7
vmovdqa LCPI0_5(%rip), %xmm3 ## xmm3 = [47,43]
vpsrlvq %xmm3, %xmm5, %xmm5
vpor %xmm7, %xmm5, %xmm5
vpxor %xmm6, %xmm5, %xmm5
vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
vpaddq %xmm5, %xmm6, %xmm6
vpsllvq %xmm0, %xmm5, %xmm7
vpsrlvq %xmm1, %xmm5, %xmm5
vpor %xmm7, %xmm5, %xmm5
vpxor %xmm6, %xmm5, %xmm5
vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
vpaddq %xmm5, %xmm6, %xmm6
vpsllvq %xmm2, %xmm5, %xmm7
vpsrlvq %xmm3, %xmm5, %xmm5
vpor %xmm7, %xmm5, %xmm5
vpxor %xmm6, %xmm5, %xmm5
vpshufd $30, %xmm6, %xmm6 ## xmm6 = xmm6[2,3,1,0]
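## Finalization: XOR the constant 0xff into the upper state lane (SipHash's
## "v2 ^= 0xff" step), then run four full rounds before folding the state.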
movl $255, %eax
vmovq %rax, %xmm7
vpslldq $8, %xmm7, %xmm7 ## xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7]
vpxor %xmm7, %xmm4, %xmm4
vpxor %xmm4, %xmm6, %xmm4
vpaddq %xmm5, %xmm4, %xmm4
vpsllvq %xmm0, %xmm5, %xmm6
vpsrlvq %xmm1, %xmm5, %xmm5
vpor %xmm6, %xmm5, %xmm5
vpxor %xmm4, %xmm5, %xmm5
vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
vpaddq %xmm5, %xmm4, %xmm4
vpsllvq %xmm2, %xmm5, %xmm6
vpsrlvq %xmm3, %xmm5, %xmm5
vpor %xmm6, %xmm5, %xmm5
vpxor %xmm4, %xmm5, %xmm5
vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
vpaddq %xmm5, %xmm4, %xmm4
vpsllvq %xmm0, %xmm5, %xmm6
vpsrlvq %xmm1, %xmm5, %xmm5
vpor %xmm6, %xmm5, %xmm5
vpxor %xmm4, %xmm5, %xmm5
vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
vpaddq %xmm5, %xmm4, %xmm4
vpsllvq %xmm2, %xmm5, %xmm6
vpsrlvq %xmm3, %xmm5, %xmm5
vpor %xmm6, %xmm5, %xmm5
vpxor %xmm4, %xmm5, %xmm5
vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
vpaddq %xmm5, %xmm4, %xmm4
vpsllvq %xmm0, %xmm5, %xmm6
vpsrlvq %xmm1, %xmm5, %xmm5
vpor %xmm6, %xmm5, %xmm5
vpxor %xmm4, %xmm5, %xmm5
vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
vpaddq %xmm5, %xmm4, %xmm4
vpsllvq %xmm2, %xmm5, %xmm6
vpsrlvq %xmm3, %xmm5, %xmm5
vpor %xmm6, %xmm5, %xmm5
vpxor %xmm4, %xmm5, %xmm5
vpshufd $30, %xmm4, %xmm4 ## xmm4 = xmm4[2,3,1,0]
vpaddq %xmm5, %xmm4, %xmm4
vpsllvq %xmm0, %xmm5, %xmm0
vpsrlvq %xmm1, %xmm5, %xmm1
vpor %xmm0, %xmm1, %xmm0
vpxor %xmm4, %xmm0, %xmm0
vpshufd $30, %xmm4, %xmm1 ## xmm1 = xmm4[2,3,1,0]
vpaddq %xmm0, %xmm1, %xmm1
vpsllvq %xmm2, %xmm0, %xmm2
vpsrlvq %xmm3, %xmm0, %xmm0
vpor %xmm2, %xmm0, %xmm0
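## Fold the packed state: XOR all four 64-bit state words together and
## return the resulting 64-bit digest in %rax.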
vpshufd $30, %xmm1, %xmm2 ## xmm2 = xmm1[2,3,1,0]
vpxor %xmm2, %xmm1, %xmm1
vpxor %xmm1, %xmm0, %xmm0
vpshufd $78, %xmm0, %xmm1 ## xmm1 = xmm0[2,3,0,1]
vpxor %xmm1, %xmm0, %xmm0
vmovq %xmm0, %rax
addq $40, %rsp
popq %rbx
popq %rbp
retq
.cfi_endproc
FN_END siphash_avx2