
asm_arm64.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build go1.11
// +build !gccgo,!appengine

#include "textflag.h"

#define NUM_ROUNDS 10

// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
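// Note: this routine generates four 64-byte ChaCha20 blocks (256 bytes of
// keystream) per iteration of the main loop. State word i of the four
// blocks is held across the four 32-bit lanes of Vi, so every vector
// instruction below advances all four blocks at once.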
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD	dst+0(FP), R1
	MOVD	src+24(FP), R2
	MOVD	src_len+32(FP), R3
	MOVD	key+48(FP), R4
	MOVD	nonce+56(FP), R6
	MOVD	counter+64(FP), R7
	MOVD	$·constants(SB), R10
	MOVD	$·incRotMatrix(SB), R11

	MOVW	(R7), R20

	AND	$~255, R3, R13
	ADD	R2, R13, R12 // R12 for block end
	AND	$255, R3, R13
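	// R12 now points just past the last full 256-byte chunk of src; the
	// loop below stops there. Any remaining tail (R13 < 256 bytes) is
	// presumably handled by the Go caller rather than by this routine.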
loop:
	MOVD	$NUM_ROUNDS, R21
	VLD1	(R11), [V30.S4, V31.S4]
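	// V30 now holds the per-lane counter increments {0, 1, 2, 3}, and V31
	// the byte-shuffle table used by VTBL to rotate each 32-bit word left
	// by 8 (see ·incRotMatrix below).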
	// load constants
	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD	$0x4D60E940

	// load keys
	// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
	WORD	$0x4DFFE884
	// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
	WORD	$0x4DFFE888
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V12.S4]
	WORD	$0x4D40C8EC
	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
	WORD	$0x4D40E8CD

	// update counter
	VADD	V30.S4, V12.S4, V12.S4
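	// After the add, the lanes of V12 hold counter+0 .. counter+3: each of
	// the four parallel blocks gets its own block counter.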
chacha:
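	// One iteration of this loop is a ChaCha double round (a column round
	// followed by a diagonal round) applied to all four blocks; R21 counts
	// NUM_ROUNDS = 10 double rounds, i.e. the full 20 rounds.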
	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8

	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
	VADD	V8.S4, V12.S4, V8.S4
	VADD	V9.S4, V13.S4, V9.S4
	VADD	V10.S4, V14.S4, V10.S4
	VADD	V11.S4, V15.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$12, V16.S4, V4.S4
	VSHL	$12, V17.S4, V5.S4
	VSHL	$12, V18.S4, V6.S4
	VSHL	$12, V19.S4, V7.S4
	VSRI	$20, V16.S4, V4.S4
	VSRI	$20, V17.S4, V5.S4
	VSRI	$20, V18.S4, V6.S4
	VSRI	$20, V19.S4, V7.S4

	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
	VADD	V12.S4, V8.S4, V8.S4
	VADD	V13.S4, V9.S4, V9.S4
	VADD	V14.S4, V10.S4, V10.S4
	VADD	V15.S4, V11.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$7, V16.S4, V4.S4
	VSHL	$7, V17.S4, V5.S4
	VSHL	$7, V18.S4, V6.S4
	VSHL	$7, V19.S4, V7.S4
	VSRI	$25, V16.S4, V4.S4
	VSRI	$25, V17.S4, V5.S4
	VSRI	$25, V18.S4, V6.S4
	VSRI	$25, V19.S4, V7.S4
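	// Second half of the double round: the same quarter-round pattern, now
	// applied to the diagonals (V0,V5,V10,V15), (V1,V6,V11,V12),
	// (V2,V7,V8,V13), (V3,V4,V9,V14).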
	// V0..V3 += V5..V7, V4
	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
	VADD	V0.S4, V5.S4, V0.S4
	VADD	V1.S4, V6.S4, V1.S4
	VADD	V2.S4, V7.S4, V2.S4
	VADD	V3.S4, V4.S4, V3.S4
	VEOR	V15.B16, V0.B16, V15.B16
	VEOR	V12.B16, V1.B16, V12.B16
	VEOR	V13.B16, V2.B16, V13.B16
	VEOR	V14.B16, V3.B16, V14.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8

	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$12, V16.S4, V5.S4
	VSHL	$12, V17.S4, V6.S4
	VSHL	$12, V18.S4, V7.S4
	VSHL	$12, V19.S4, V4.S4
	VSRI	$20, V16.S4, V5.S4
	VSRI	$20, V17.S4, V6.S4
	VSRI	$20, V18.S4, V7.S4
	VSRI	$20, V19.S4, V4.S4

	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
	// ...
	VADD	V5.S4, V0.S4, V0.S4
	VADD	V6.S4, V1.S4, V1.S4
	VADD	V7.S4, V2.S4, V2.S4
	VADD	V4.S4, V3.S4, V3.S4
	VEOR	V0.B16, V15.B16, V15.B16
	VEOR	V1.B16, V12.B16, V12.B16
	VEOR	V2.B16, V13.B16, V13.B16
	VEOR	V3.B16, V14.B16, V14.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$7, V16.S4, V5.S4
	VSHL	$7, V17.S4, V6.S4
	VSHL	$7, V18.S4, V7.S4
	VSHL	$7, V19.S4, V4.S4
	VSRI	$25, V16.S4, V5.S4
	VSRI	$25, V17.S4, V6.S4
	VSRI	$25, V18.S4, V7.S4
	VSRI	$25, V19.S4, V4.S4

	SUB	$1, R21
	CBNZ	R21, chacha
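	// The rounds are done. Reload the initial state (constants, key,
	// counter, nonce) and add it to the working state, as ChaCha20
	// requires. V12 also gets the {0,1,2,3} lane increments (still in V30)
	// added back so each block ends up with its own counter value.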
	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
	WORD	$0x4D60E950
	// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
	WORD	$0x4DFFE894
	VADD	V30.S4, V12.S4, V12.S4
	VADD	V16.S4, V0.S4, V0.S4
	VADD	V17.S4, V1.S4, V1.S4
	VADD	V18.S4, V2.S4, V2.S4
	VADD	V19.S4, V3.S4, V3.S4
	// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
	WORD	$0x4DFFE898
	// restore R4
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V28.S4]
	WORD	$0x4D40C8FC
	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
	WORD	$0x4D40E8DD

	VADD	V20.S4, V4.S4, V4.S4
	VADD	V21.S4, V5.S4, V5.S4
	VADD	V22.S4, V6.S4, V6.S4
	VADD	V23.S4, V7.S4, V7.S4
	VADD	V24.S4, V8.S4, V8.S4
	VADD	V25.S4, V9.S4, V9.S4
	VADD	V26.S4, V10.S4, V10.S4
	VADD	V27.S4, V11.S4, V11.S4
	VADD	V28.S4, V12.S4, V12.S4
	VADD	V29.S4, V13.S4, V13.S4
	VADD	V30.S4, V14.S4, V14.S4
	VADD	V31.S4, V15.S4, V15.S4
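	// V0..V15 now hold the finished state: word i of block j sits in lane
	// j of Vi. The VZIP1/VZIP2 sequence below transposes this so that each
	// block's 64 bytes of keystream become contiguous (block 0 in V0..V3,
	// block 1 in V4..V7, block 2 in V8..V11, block 3 in V12..V15), ready
	// to XOR against 256 bytes of input.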
	VZIP1	V1.S4, V0.S4, V16.S4
	VZIP2	V1.S4, V0.S4, V17.S4
	VZIP1	V3.S4, V2.S4, V18.S4
	VZIP2	V3.S4, V2.S4, V19.S4
	VZIP1	V5.S4, V4.S4, V20.S4
	VZIP2	V5.S4, V4.S4, V21.S4
	VZIP1	V7.S4, V6.S4, V22.S4
	VZIP2	V7.S4, V6.S4, V23.S4
	VZIP1	V9.S4, V8.S4, V24.S4
	VZIP2	V9.S4, V8.S4, V25.S4
	VZIP1	V11.S4, V10.S4, V26.S4
	VZIP2	V11.S4, V10.S4, V27.S4
	VZIP1	V13.S4, V12.S4, V28.S4
	VZIP2	V13.S4, V12.S4, V29.S4
	VZIP1	V15.S4, V14.S4, V30.S4
	VZIP2	V15.S4, V14.S4, V31.S4
	VZIP1	V18.D2, V16.D2, V0.D2
	VZIP2	V18.D2, V16.D2, V4.D2
	VZIP1	V19.D2, V17.D2, V8.D2
	VZIP2	V19.D2, V17.D2, V12.D2
	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]
	VZIP1	V22.D2, V20.D2, V1.D2
	VZIP2	V22.D2, V20.D2, V5.D2
	VZIP1	V23.D2, V21.D2, V9.D2
	VZIP2	V23.D2, V21.D2, V13.D2
	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
	VZIP1	V26.D2, V24.D2, V2.D2
	VZIP2	V26.D2, V24.D2, V6.D2
	VZIP1	V27.D2, V25.D2, V10.D2
	VZIP2	V27.D2, V25.D2, V14.D2
	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
	VZIP1	V30.D2, V28.D2, V3.D2
	VZIP2	V30.D2, V28.D2, V7.D2
	VZIP1	V31.D2, V29.D2, V11.D2
	VZIP2	V31.D2, V29.D2, V15.D2
	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]

	VEOR	V0.B16, V16.B16, V16.B16
	VEOR	V1.B16, V17.B16, V17.B16
	VEOR	V2.B16, V18.B16, V18.B16
	VEOR	V3.B16, V19.B16, V19.B16
	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
	VEOR	V4.B16, V20.B16, V20.B16
	VEOR	V5.B16, V21.B16, V21.B16
	VEOR	V6.B16, V22.B16, V22.B16
	VEOR	V7.B16, V23.B16, V23.B16
	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
	VEOR	V8.B16, V24.B16, V24.B16
	VEOR	V9.B16, V25.B16, V25.B16
	VEOR	V10.B16, V26.B16, V26.B16
	VEOR	V11.B16, V27.B16, V27.B16
	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
	VEOR	V12.B16, V28.B16, V28.B16
	VEOR	V13.B16, V29.B16, V29.B16
	VEOR	V14.B16, V30.B16, V30.B16
	VEOR	V15.B16, V31.B16, V31.B16
	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)
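	// Four blocks were emitted, so advance the 32-bit block counter by 4
	// and write it back for the next iteration (and for the caller).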
	ADD	$4, R20
	MOVW	R20, (R7) // update counter

	CMP	R2, R12
	BGT	loop
	RET
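
// ·constants is the standard ChaCha20 "expand 32-byte k" constant, stored
// as four little-endian 32-bit words.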
DATA	·constants+0x00(SB)/4, $0x61707865
DATA	·constants+0x04(SB)/4, $0x3320646e
DATA	·constants+0x08(SB)/4, $0x79622d32
DATA	·constants+0x0c(SB)/4, $0x6b206574
GLOBL	·constants(SB), NOPTR|RODATA, $32
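
// ·incRotMatrix packs two things: the first 16 bytes are the per-lane
// counter increments {0, 1, 2, 3} loaded into V30, and the second 16 bytes
// are the VTBL byte indices (loaded into V31) that rotate every 32-bit
// word left by 8 bits during the rounds.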
DATA	·incRotMatrix+0x00(SB)/4, $0x00000000
DATA	·incRotMatrix+0x04(SB)/4, $0x00000001
DATA	·incRotMatrix+0x08(SB)/4, $0x00000002
DATA	·incRotMatrix+0x0c(SB)/4, $0x00000003
DATA	·incRotMatrix+0x10(SB)/4, $0x02010003
DATA	·incRotMatrix+0x14(SB)/4, $0x06050407
DATA	·incRotMatrix+0x18(SB)/4, $0x0A09080B
DATA	·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32