asm_ppc64le.s

// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Original code can be found at the link below:
// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91e5c39ca79126a4a876d5d8ff
// There are some differences between the CRYPTOGAMS code and this one. The
// round loop for "_int" isn't the same as the original. Some adjustments were
// necessary because there are fewer vector registers available. For example,
// some X variables (r12, r13, r14, and r15) share the same register used by
// the counter. The original code uses "ctr" to name the counter. Here we use
// CNT, because the Go assembler reserves CTR as the counter register name.

// +build ppc64le,!gccgo,!appengine

#include "textflag.h"

#define OUT R3
#define INP R4
#define LEN R5
#define KEY R6
#define CNT R7
#define TEMP R8

#define X0 R11
#define X1 R12
#define X2 R14
#define X3 R15
#define X4 R16
#define X5 R17
#define X6 R18
#define X7 R19
#define X8 R20
#define X9 R21
#define X10 R22
#define X11 R23
#define X12 R24
#define X13 R25
#define X14 R26
#define X15 R27

#define CON0 X0
#define CON1 X1
#define CON2 X2
#define CON3 X3

#define KEY0 X4
#define KEY1 X5
#define KEY2 X6
#define KEY3 X7
#define KEY4 X8
#define KEY5 X9
#define KEY6 X10
#define KEY7 X11

#define CNT0 X12
#define CNT1 X13
#define CNT2 X14
#define CNT3 X15

#define TMP0 R9
#define TMP1 R10
#define TMP2 R28
#define TMP3 R29

#define CONSTS R8

#define A0 V0
#define B0 V1
#define C0 V2
#define D0 V3
#define A1 V4
#define B1 V5
#define C1 V6
#define D1 V7
#define A2 V8
#define B2 V9
#define C2 V10
#define D2 V11
#define T0 V12
#define T1 V13
#define T2 V14

#define K0 V15
#define K1 V16
#define K2 V17
#define K3 V18
#define K4 V19
#define K5 V20

#define FOUR V21
#define SIXTEEN V22
#define TWENTY4 V23
#define TWENTY V24
#define TWELVE V25
#define TWENTY5 V26
#define SEVEN V27

#define INPPERM V28
#define OUTPERM V29
#define OUTMASK V30

#define DD0 V31
#define DD1 SEVEN
#define DD2 T0
#define DD3 T1
#define DD4 T2
DATA ·consts+0x00(SB)/8, $0x3320646e61707865
DATA ·consts+0x08(SB)/8, $0x6b20657479622d32
DATA ·consts+0x10(SB)/8, $0x0000000000000001
DATA ·consts+0x18(SB)/8, $0x0000000000000000
DATA ·consts+0x20(SB)/8, $0x0000000000000004
DATA ·consts+0x28(SB)/8, $0x0000000000000000
DATA ·consts+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA ·consts+0x38(SB)/8, $0x0203000106070405
DATA ·consts+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA ·consts+0x48(SB)/8, $0x0102030005060704
GLOBL ·consts(SB), RODATA, $80
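
// Layout of ·consts, as consumed by the LVX loads below:
//   0x00-0x0f: the ChaCha constant "expand 32-byte k" (little-endian words)
//   0x10-0x1f: {1, 0, 0, 0}, the one-block counter increment
//   0x20-0x2f: {4, 0, 0, 0}, the four-block counter increment (FOUR)
//   0x30-0x4f: VPERM masks for the byte-aligned rotations (SIXTEEN pairs
//              with the scalar ROTLW $16 steps, TWENTY4 with ROTLW $8)
// A quick Go check of the first two doublewords (an illustrative sketch using
// fmt and encoding/binary, not part of this file):
//
//	sigma := []byte("expand 32-byte k")
//	fmt.Printf("%#x %#x\n",
//		binary.LittleEndian.Uint64(sigma[0:8]),
//		binary.LittleEndian.Uint64(sigma[8:16]))
//	// Output: 0x3320646e61707865 0x6b20657479622d32
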
// func chaCha20_ctr32_vmx(out, inp *byte, len int, key *[32]byte, counter *[16]byte)
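// A matching Go-side declaration would be along these lines (a sketch; the
// exact parameter types are whatever the accompanying Go file in this
// package declares):
//
//	//go:noescape
//	func chaCha20_ctr32_vmx(out, inp *byte, len int, key *[32]byte, counter *[16]byte)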
TEXT ·chaCha20_ctr32_vmx(SB), NOSPLIT|NOFRAME, $0
	// Load the arguments into registers.
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT
	MOVD $·consts(SB), CONSTS // point to consts addr
	MOVD $16, X0
	MOVD $32, X1
	MOVD $48, X2
	MOVD $64, X3
	MOVD $31, X4
	MOVD $15, X5
	// Load key
	LVX (KEY)(R0), K1
	LVSR (KEY)(R0), T0
	LVX (KEY)(X0), K2
	LVX (KEY)(X4), DD0
	// Load counter
	LVX (CNT)(R0), K3
	LVSR (CNT)(R0), T1
	LVX (CNT)(X5), DD1
	// Load constants
	LVX (CONSTS)(R0), K0
	LVX (CONSTS)(X0), K5
	LVX (CONSTS)(X1), FOUR
	LVX (CONSTS)(X2), SIXTEEN
	LVX (CONSTS)(X3), TWENTY4
	// Align key and counter
	VPERM K2, K1, T0, K1
	VPERM DD0, K2, T0, K2
	VPERM DD1, K3, T1, K3
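	// The LVX/LVSR/VPERM sequence above is the classic Altivec idiom for
	// unaligned loads: LVX ignores the low four address bits, so two loads
	// bracketing the unaligned region are merged by VPERM using the shift
	// vector produced by LVSR.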
	// Load counter to GPR
	MOVWZ 0(CNT), CNT0
	MOVWZ 4(CNT), CNT1
	MOVWZ 8(CNT), CNT2
	MOVWZ 12(CNT), CNT3
	// Adjust vectors for the initial state
	VADDUWM K3, K5, K3
	VADDUWM K3, K5, K4
	VADDUWM K4, K5, K5
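	// K3, K4, and K5 now hold the block counters for the three vector
	// blocks (counter+1, counter+2, and counter+3); the scalar code in
	// the loop below handles the block at counter+0.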
	// Synthesized constants
	VSPLTISW $-12, TWENTY
	VSPLTISW $12, TWELVE
	VSPLTISW $-7, TWENTY5
	VXOR T0, T0, T0
	VSPLTISW $-1, OUTMASK
	LVSR (INP)(R0), INPPERM
	LVSL (OUT)(R0), OUTPERM
	VPERM OUTMASK, T0, OUTPERM, OUTMASK
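	// VSPLTISW sign-extends its 5-bit immediate, and VRLW only uses the
	// low five bits of each rotate count, so $-12 and $-7 can serve as
	// rotate amounts of 20 and 25; $-1 splats the all-ones mask. INPPERM
	// and OUTPERM are the LVSR/LVSL shift vectors used to realign the
	// possibly-unaligned input and output buffers.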
loop_outer_vmx:
	// Load constant
	MOVD $0x61707865, CON0
	MOVD $0x3320646e, CON1
	MOVD $0x79622d32, CON2
	MOVD $0x6b206574, CON3
	VOR K0, K0, A0
	VOR K0, K0, A1
	VOR K0, K0, A2
	VOR K1, K1, B0
	MOVD $10, TEMP
	// Load key to GPR
	MOVWZ 0(KEY), X4
	MOVWZ 4(KEY), X5
	MOVWZ 8(KEY), X6
	MOVWZ 12(KEY), X7
	VOR K1, K1, B1
	VOR K1, K1, B2
	MOVWZ 16(KEY), X8
	MOVWZ 0(CNT), X12
	MOVWZ 20(KEY), X9
	MOVWZ 4(CNT), X13
	VOR K2, K2, C0
	VOR K2, K2, C1
	MOVWZ 24(KEY), X10
	MOVWZ 8(CNT), X14
	VOR K2, K2, C2
	VOR K3, K3, D0
	MOVWZ 28(KEY), X11
	MOVWZ 12(CNT), X15
	VOR K4, K4, D1
	VOR K5, K5, D2
	MOVD X4, TMP0
	MOVD X5, TMP1
	MOVD X6, TMP2
	MOVD X7, TMP3
	VSPLTISW $7, SEVEN
	MOVD TEMP, CTR
loop_vmx:
	// CRYPTOGAMS generates this loop with a Perl macro. That isn't
	// possible with assembly macros, so the macro's expansion is used
	// here in order to maintain the algorithm's efficiency.
	// This loop generates three keystream blocks using VMX instructions
	// and, in parallel, one keystream block using scalar instructions.
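	// For reference, each interleaved group of instructions implements
	// the standard ChaCha quarter round. In Go, using math/bits (an
	// illustrative sketch, not part of this file):
	//
	//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	//		a += b; d ^= a; d = bits.RotateLeft32(d, 16)
	//		c += d; b ^= c; b = bits.RotateLeft32(b, 12)
	//		a += b; d ^= a; d = bits.RotateLeft32(d, 8)
	//		c += d; b ^= c; b = bits.RotateLeft32(b, 7)
	//		return a, b, c, d
	//	}
	//
	// The ROTLW, VRLW, and VPERM steps below are these rotations: VRLW
	// for 12 and 7, VPERM byte shuffles for 16 and 8.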
	ADD X4, X0, X0
	ADD X5, X1, X1
	VADDUWM A0, B0, A0
	VADDUWM A1, B1, A1
	ADD X6, X2, X2
	ADD X7, X3, X3
	VADDUWM A2, B2, A2
	VXOR D0, A0, D0
	XOR X0, X12, X12
	XOR X1, X13, X13
	VXOR D1, A1, D1
	VXOR D2, A2, D2
	XOR X2, X14, X14
	XOR X3, X15, X15
	VPERM D0, D0, SIXTEEN, D0
	VPERM D1, D1, SIXTEEN, D1
	ROTLW $16, X12, X12
	ROTLW $16, X13, X13
	VPERM D2, D2, SIXTEEN, D2
	VADDUWM C0, D0, C0
	ROTLW $16, X14, X14
	ROTLW $16, X15, X15
	VADDUWM C1, D1, C1
	VADDUWM C2, D2, C2
	ADD X12, X8, X8
	ADD X13, X9, X9
	VXOR B0, C0, T0
	VXOR B1, C1, T1
	ADD X14, X10, X10
	ADD X15, X11, X11
	VXOR B2, C2, T2
	VRLW T0, TWELVE, B0
	XOR X8, X4, X4
	XOR X9, X5, X5
	VRLW T1, TWELVE, B1
	VRLW T2, TWELVE, B2
	XOR X10, X6, X6
	XOR X11, X7, X7
	VADDUWM A0, B0, A0
	VADDUWM A1, B1, A1
	ROTLW $12, X4, X4
	ROTLW $12, X5, X5
	VADDUWM A2, B2, A2
	VXOR D0, A0, D0
	ROTLW $12, X6, X6
	ROTLW $12, X7, X7
	VXOR D1, A1, D1
	VXOR D2, A2, D2
	ADD X4, X0, X0
	ADD X5, X1, X1
	VPERM D0, D0, TWENTY4, D0
	VPERM D1, D1, TWENTY4, D1
	ADD X6, X2, X2
	ADD X7, X3, X3
	VPERM D2, D2, TWENTY4, D2
	VADDUWM C0, D0, C0
	XOR X0, X12, X12
	XOR X1, X13, X13
	VADDUWM C1, D1, C1
	VADDUWM C2, D2, C2
	XOR X2, X14, X14
	XOR X3, X15, X15
	VXOR B0, C0, T0
	VXOR B1, C1, T1
	ROTLW $8, X12, X12
	ROTLW $8, X13, X13
	VXOR B2, C2, T2
	VRLW T0, SEVEN, B0
	ROTLW $8, X14, X14
	ROTLW $8, X15, X15
	VRLW T1, SEVEN, B1
	VRLW T2, SEVEN, B2
	ADD X12, X8, X8
	ADD X13, X9, X9
	VSLDOI $8, C0, C0, C0
	VSLDOI $8, C1, C1, C1
	ADD X14, X10, X10
	ADD X15, X11, X11
	VSLDOI $8, C2, C2, C2
	VSLDOI $12, B0, B0, B0
	XOR X8, X4, X4
	XOR X9, X5, X5
	VSLDOI $12, B1, B1, B1
	VSLDOI $12, B2, B2, B2
	XOR X10, X6, X6
	XOR X11, X7, X7
	VSLDOI $4, D0, D0, D0
	VSLDOI $4, D1, D1, D1
	ROTLW $7, X4, X4
	ROTLW $7, X5, X5
	VSLDOI $4, D2, D2, D2
	VADDUWM A0, B0, A0
	ROTLW $7, X6, X6
	ROTLW $7, X7, X7
	VADDUWM A1, B1, A1
	VADDUWM A2, B2, A2
	ADD X5, X0, X0
	ADD X6, X1, X1
	VXOR D0, A0, D0
	VXOR D1, A1, D1
	ADD X7, X2, X2
	ADD X4, X3, X3
	VXOR D2, A2, D2
	VPERM D0, D0, SIXTEEN, D0
	XOR X0, X15, X15
	XOR X1, X12, X12
	VPERM D1, D1, SIXTEEN, D1
	VPERM D2, D2, SIXTEEN, D2
	XOR X2, X13, X13
	XOR X3, X14, X14
	VADDUWM C0, D0, C0
	VADDUWM C1, D1, C1
	ROTLW $16, X15, X15
	ROTLW $16, X12, X12
	VADDUWM C2, D2, C2
	VXOR B0, C0, T0
	ROTLW $16, X13, X13
	ROTLW $16, X14, X14
	VXOR B1, C1, T1
	VXOR B2, C2, T2
	ADD X15, X10, X10
	ADD X12, X11, X11
	VRLW T0, TWELVE, B0
	VRLW T1, TWELVE, B1
	ADD X13, X8, X8
	ADD X14, X9, X9
	VRLW T2, TWELVE, B2
	VADDUWM A0, B0, A0
	XOR X10, X5, X5
	XOR X11, X6, X6
	VADDUWM A1, B1, A1
	VADDUWM A2, B2, A2
	XOR X8, X7, X7
	XOR X9, X4, X4
	VXOR D0, A0, D0
	VXOR D1, A1, D1
	ROTLW $12, X5, X5
	ROTLW $12, X6, X6
	VXOR D2, A2, D2
	VPERM D0, D0, TWENTY4, D0
	ROTLW $12, X7, X7
	ROTLW $12, X4, X4
	VPERM D1, D1, TWENTY4, D1
	VPERM D2, D2, TWENTY4, D2
	ADD X5, X0, X0
	ADD X6, X1, X1
	VADDUWM C0, D0, C0
	VADDUWM C1, D1, C1
	ADD X7, X2, X2
	ADD X4, X3, X3
	VADDUWM C2, D2, C2
	VXOR B0, C0, T0
	XOR X0, X15, X15
	XOR X1, X12, X12
	VXOR B1, C1, T1
	VXOR B2, C2, T2
	XOR X2, X13, X13
	XOR X3, X14, X14
	VRLW T0, SEVEN, B0
	VRLW T1, SEVEN, B1
	ROTLW $8, X15, X15
	ROTLW $8, X12, X12
	VRLW T2, SEVEN, B2
	VSLDOI $8, C0, C0, C0
	ROTLW $8, X13, X13
	ROTLW $8, X14, X14
	VSLDOI $8, C1, C1, C1
	VSLDOI $8, C2, C2, C2
	ADD X15, X10, X10
	ADD X12, X11, X11
	VSLDOI $4, B0, B0, B0
	VSLDOI $4, B1, B1, B1
	ADD X13, X8, X8
	ADD X14, X9, X9
	VSLDOI $4, B2, B2, B2
	VSLDOI $12, D0, D0, D0
	XOR X10, X5, X5
	XOR X11, X6, X6
	VSLDOI $12, D1, D1, D1
	VSLDOI $12, D2, D2, D2
	XOR X8, X7, X7
	XOR X9, X4, X4
	ROTLW $7, X5, X5
	ROTLW $7, X6, X6
	ROTLW $7, X7, X7
	ROTLW $7, X4, X4
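	// BC 0x10, 0, target is "bdnz": it decrements CTR and branches while
	// CTR is non-zero, so the double round above executes ten times (20
	// rounds in total).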
	BC 0x10, 0, loop_vmx
	SUB $256, LEN, LEN
	// Accumulate key block (scalar)
	ADD $0x61707865, X0, X0
	ADD $0x3320646e, X1, X1
	ADD $0x79622d32, X2, X2
	ADD $0x6b206574, X3, X3
	ADD TMP0, X4, X4
	ADD TMP1, X5, X5
	ADD TMP2, X6, X6
	ADD TMP3, X7, X7
	MOVWZ 16(KEY), TMP0
	MOVWZ 20(KEY), TMP1
	MOVWZ 24(KEY), TMP2
	MOVWZ 28(KEY), TMP3
	ADD TMP0, X8, X8
	ADD TMP1, X9, X9
	ADD TMP2, X10, X10
	ADD TMP3, X11, X11
	MOVWZ 12(CNT), TMP0
	MOVWZ 8(CNT), TMP1
	MOVWZ 4(CNT), TMP2
	MOVWZ 0(CNT), TEMP
	ADD TMP0, X15, X15
	ADD TMP1, X14, X14
	ADD TMP2, X13, X13
	ADD TEMP, X12, X12
	// Accumulate key block (vector)
	VADDUWM A0, K0, A0
	VADDUWM A1, K0, A1
	VADDUWM A2, K0, A2
	VADDUWM B0, K1, B0
	VADDUWM B1, K1, B1
	VADDUWM B2, K1, B2
	VADDUWM C0, K2, C0
	VADDUWM C1, K2, C1
	VADDUWM C2, K2, C2
	VADDUWM D0, K3, D0
	VADDUWM D1, K4, D1
	VADDUWM D2, K5, D2
	// Increment counter
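	// Each outer iteration produces four 64-byte blocks (one scalar,
	// three vector), so the 32-bit block counter advances by 4, both in
	// memory and in the K3-K5 counter vectors.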
	ADD $4, TEMP, TEMP
	MOVW TEMP, 0(CNT)
	VADDUWM K3, FOUR, K3
	VADDUWM K4, FOUR, K4
	VADDUWM K5, FOUR, K5
	// XOR the input slice (INP) with the keystream. The scalar block is
	// held in GPRs (X0-X15); the vector blocks are in A0-D2.
	// Load input (aligned or not)
	MOVWZ 0(INP), TMP0
	MOVWZ 4(INP), TMP1
	MOVWZ 8(INP), TMP2
	MOVWZ 12(INP), TMP3
	// XOR with input
	XOR TMP0, X0, X0
	XOR TMP1, X1, X1
	XOR TMP2, X2, X2
	XOR TMP3, X3, X3
	MOVWZ 16(INP), TMP0
	MOVWZ 20(INP), TMP1
	MOVWZ 24(INP), TMP2
	MOVWZ 28(INP), TMP3
	XOR TMP0, X4, X4
	XOR TMP1, X5, X5
	XOR TMP2, X6, X6
	XOR TMP3, X7, X7
	MOVWZ 32(INP), TMP0
	MOVWZ 36(INP), TMP1
	MOVWZ 40(INP), TMP2
	MOVWZ 44(INP), TMP3
	XOR TMP0, X8, X8
	XOR TMP1, X9, X9
	XOR TMP2, X10, X10
	XOR TMP3, X11, X11
	MOVWZ 48(INP), TMP0
	MOVWZ 52(INP), TMP1
	MOVWZ 56(INP), TMP2
	MOVWZ 60(INP), TMP3
	XOR TMP0, X12, X12
	XOR TMP1, X13, X13
	XOR TMP2, X14, X14
	XOR TMP3, X15, X15
	// Store output (aligned or not)
	MOVW X0, 0(OUT)
	MOVW X1, 4(OUT)
	MOVW X2, 8(OUT)
	MOVW X3, 12(OUT)
	ADD $64, INP, INP // INP points to the end of the slice for the alignment code below
	MOVW X4, 16(OUT)
	MOVD $16, TMP0
	MOVW X5, 20(OUT)
	MOVD $32, TMP1
	MOVW X6, 24(OUT)
	MOVD $48, TMP2
	MOVW X7, 28(OUT)
	MOVD $64, TMP3
	MOVW X8, 32(OUT)
	MOVW X9, 36(OUT)
	MOVW X10, 40(OUT)
	MOVW X11, 44(OUT)
	MOVW X12, 48(OUT)
	MOVW X13, 52(OUT)
	MOVW X14, 56(OUT)
	MOVW X15, 60(OUT)
	ADD $64, OUT, OUT
	// Load input
	LVX (INP)(R0), DD0
	LVX (INP)(TMP0), DD1
	LVX (INP)(TMP1), DD2
	LVX (INP)(TMP2), DD3
	LVX (INP)(TMP3), DD4
	ADD $64, INP, INP
	VPERM DD1, DD0, INPPERM, DD0 // Align input
	VPERM DD2, DD1, INPPERM, DD1
	VPERM DD3, DD2, INPPERM, DD2
	VPERM DD4, DD3, INPPERM, DD3
	VXOR A0, DD0, A0 // XOR with input
	VXOR B0, DD1, B0
	LVX (INP)(TMP0), DD1 // Keep loading input
	VXOR C0, DD2, C0
	LVX (INP)(TMP1), DD2
	VXOR D0, DD3, D0
	LVX (INP)(TMP2), DD3
	LVX (INP)(TMP3), DD0
	ADD $64, INP, INP
	MOVD $63, TMP3 // 63 is not a typo
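	// TMP3 becomes 63 rather than 64 so the final LVX of each group
	// addresses the last input byte instead of one past it; see the
	// page-boundary note below.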
	VPERM A0, A0, OUTPERM, A0
	VPERM B0, B0, OUTPERM, B0
	VPERM C0, C0, OUTPERM, C0
	VPERM D0, D0, OUTPERM, D0
	VPERM DD1, DD4, INPPERM, DD4 // Align input
	VPERM DD2, DD1, INPPERM, DD1
	VPERM DD3, DD2, INPPERM, DD2
	VPERM DD0, DD3, INPPERM, DD3
	VXOR A1, DD4, A1
	VXOR B1, DD1, B1
	LVX (INP)(TMP0), DD1 // Keep loading input
	VXOR C1, DD2, C1
	LVX (INP)(TMP1), DD2
	VXOR D1, DD3, D1
	LVX (INP)(TMP2), DD3
	// Note that the LVX address is always rounded down to the nearest
	// 16-byte boundary, and that it always points to at most 15 bytes
	// beyond the end of the slice, so we cannot cross a page boundary.
	LVX (INP)(TMP3), DD4 // Redundant in aligned case.
	ADD $64, INP, INP
	VPERM A1, A1, OUTPERM, A1 // Pre-misalign output
	VPERM B1, B1, OUTPERM, B1
	VPERM C1, C1, OUTPERM, C1
	VPERM D1, D1, OUTPERM, D1
	VPERM DD1, DD0, INPPERM, DD0 // Align input
	VPERM DD2, DD1, INPPERM, DD1
	VPERM DD3, DD2, INPPERM, DD2
	VPERM DD4, DD3, INPPERM, DD3
	VXOR A2, DD0, A2
	VXOR B2, DD1, B2
	VXOR C2, DD2, C2
	VXOR D2, DD3, D2
	VPERM A2, A2, OUTPERM, A2
	VPERM B2, B2, OUTPERM, B2
	VPERM C2, C2, OUTPERM, C2
	VPERM D2, D2, OUTPERM, D2
	ANDCC $15, OUT, X1 // Is OUT aligned?
	MOVD OUT, X0
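	// The VSEL chain below is the usual Altivec unaligned-store trick:
	// each VSEL, driven by OUTMASK, merges adjacent pre-rotated vectors so
	// that every STVX stores 16 fully valid bytes. Only the first and last
	// partial vectors need the byte-by-byte loops at unaligned_head_vmx
	// and unaligned_tail_vmx.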
	VSEL A0, B0, OUTMASK, DD0 // Collect pre-misaligned output
	VSEL B0, C0, OUTMASK, DD1
	VSEL C0, D0, OUTMASK, DD2
	VSEL D0, A1, OUTMASK, DD3
	VSEL A1, B1, OUTMASK, B0
	VSEL B1, C1, OUTMASK, C0
	VSEL C1, D1, OUTMASK, D0
	VSEL D1, A2, OUTMASK, A1
	VSEL A2, B2, OUTMASK, B1
	VSEL B2, C2, OUTMASK, C1
	VSEL C2, D2, OUTMASK, D1
	STVX DD0, (OUT+TMP0)
	STVX DD1, (OUT+TMP1)
	STVX DD2, (OUT+TMP2)
	ADD $64, OUT, OUT
	STVX DD3, (OUT+R0)
	STVX B0, (OUT+TMP0)
	STVX C0, (OUT+TMP1)
	STVX D0, (OUT+TMP2)
	ADD $64, OUT, OUT
	STVX A1, (OUT+R0)
	STVX B1, (OUT+TMP0)
	STVX C1, (OUT+TMP1)
	STVX D1, (OUT+TMP2)
	ADD $64, OUT, OUT
	BEQ aligned_vmx
	SUB X1, OUT, X2 // in misaligned case edges
	MOVD $0, X3     // are written byte-by-byte

unaligned_tail_vmx:
	STVEBX D2, (X2+X3)
	ADD $1, X3, X3
	CMPW X3, X1
	BNE unaligned_tail_vmx
	SUB X1, X0, X2

unaligned_head_vmx:
	STVEBX A0, (X2+X1)
	CMPW X1, $15
	ADD $1, X1, X1
	BNE unaligned_head_vmx
	CMPU LEN, $255 // done with 256-byte block yet?
	BGT loop_outer_vmx
	JMP done_vmx

aligned_vmx:
	STVX A0, (X0+R0)
	CMPU LEN, $255 // done with 256-byte block yet?
	BGT loop_outer_vmx

done_vmx:
	RET