You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

chacha_s390x.s 6.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. // Copyright 2018 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build s390x,!gccgo,!appengine
  5. #include "go_asm.h"
  6. #include "textflag.h"
  // This is an implementation of the ChaCha20 encryption algorithm as
  // specified in RFC 7539. It uses vector instructions to compute
  // 4 keystream blocks in parallel (256 bytes) which are then XORed
  // with the bytes in the input slice.
  // constants<> is a 32-byte read-only table. xorKeyStreamVX loads it with
  // one VLM so that bytes 0-15 land in BSWAP and bytes 16-31 land in J0.
  GLOBL ·constants<>(SB), RODATA|NOPTR, $32

  // BSWAP (bytes 0-15): VPERM selector that reverses the byte order inside
  // each 4-byte element (3,2,1,0, 7,6,5,4, ...).
  DATA ·constants<>+0x00(SB)/4, $0x03020100
  DATA ·constants<>+0x04(SB)/4, $0x07060504
  DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
  DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c

  // J0 (bytes 16-31): [j0, j1, j2, j3], the first four words of the ChaCha
  // state ("expand 32-byte k" read as little-endian 32-bit words).
  DATA ·constants<>+0x10(SB)/4, $0x61707865
  DATA ·constants<>+0x14(SB)/4, $0x3320646e
  DATA ·constants<>+0x18(SB)/4, $0x79622d32
  DATA ·constants<>+0x1c(SB)/4, $0x6b206574
  // EXRL targets.
  //
  // mvcSrcToBuf is an instruction template rather than a callable function:
  // xorKeyStreamVX runs its MVC through EXRL with R12, which supplies the
  // length code at execution time, so the $1 written here is only a
  // placeholder. The copy goes from the address in R1 to the address in R8
  // (the tail buffer in xorKeyStreamVX).
  // NOTE(review): the RET is presumably never reached via EXRL (only the
  // first instruction is executed); it keeps the symbol a well-formed TEXT.
  TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0
  	MVC	$1, (R1), (R8)
  	RET
  // mvcBufToDst is an EXRL instruction template: xorKeyStreamVX executes its
  // MVC with R12 providing the length code (the $1 is a placeholder), copying
  // from the address in R8 (the tail buffer) to the address in R9 (the saved
  // destination pointer on the tail path).
  // NOTE(review): the RET is presumably never reached via EXRL; it keeps the
  // symbol a well-formed TEXT.
  TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0
  	MVC	$1, (R8), (R9)
  	RET
  // Vector register assignments used by xorKeyStreamVX and the macros below.

  // Constants loaded from ·constants<>.
  #define BSWAP V5  // byte-swap selector for VPERM
  #define J0    V6  // ChaCha constant words j0..j3

  // Cipher parameters.
  #define KEY0  V7  // key words 0-3
  #define KEY1  V8  // key words 4-7
  #define NONCE V9  // nonce words, shifted into elements 1-3
  #define CTR   V10 // block counters for the 4 blocks in flight

  // Scratch: message data in XORV, temporaries in SHUFFLE and during setup.
  #define M0 V11
  #define M1 V12
  #define M2 V13
  #define M3 V14

  // Per-iteration counter increment.
  #define INC V15

  // Working state: Xn holds state word n for each of the 4 parallel blocks.
  #define X0  V16
  #define X1  V17
  #define X2  V18
  #define X3  V19
  #define X4  V20
  #define X5  V21
  #define X6  V22
  #define X7  V23
  #define X8  V24
  #define X9  V25
  #define X10 V26
  #define X11 V27
  #define X12 V28
  #define X13 V29
  #define X14 V30
  #define X15 V31

  // Total number of ChaCha rounds; the round loop runs NUM_ROUNDS/2 times,
  // performing two ROUND4 invocations per iteration.
  #define NUM_ROUNDS 20
  // ROUND4 performs four ChaCha quarter rounds, interleaved instruction by
  // instruction. Each group of four arguments names one quarter round's
  // operands in the order (a, b, d, c):
  //
  //	a += b; d ^= a; d <<<= 16
  //	c += d; b ^= c; b <<<= 12
  //	a += b; d ^= a; d <<<= 8
  //	c += d; b ^= c; b <<<= 7
  //
  // The w*, x*, y* and z* groups are independent of one another. All
  // arithmetic is per 32-bit element, and every argument is updated in place.
  #define ROUND4(wa, wb, wd, wc, xa, xb, xd, xc, ya, yb, yd, yc, za, zb, zd, zc) \
  	VAF    wb, wa, wa   \
  	VAF    xb, xa, xa   \
  	VAF    yb, ya, ya   \
  	VAF    zb, za, za   \
  	VX     wa, wd, wd   \
  	VX     xa, xd, xd   \
  	VX     ya, yd, yd   \
  	VX     za, zd, zd   \
  	VERLLF $16, wd, wd  \
  	VERLLF $16, xd, xd  \
  	VERLLF $16, yd, yd  \
  	VERLLF $16, zd, zd  \
  	VAF    wd, wc, wc   \
  	VAF    xd, xc, xc   \
  	VAF    yd, yc, yc   \
  	VAF    zd, zc, zc   \
  	VX     wc, wb, wb   \
  	VX     xc, xb, xb   \
  	VX     yc, yb, yb   \
  	VX     zc, zb, zb   \
  	VERLLF $12, wb, wb  \
  	VERLLF $12, xb, xb  \
  	VERLLF $12, yb, yb  \
  	VERLLF $12, zb, zb  \
  	VAF    wb, wa, wa   \
  	VAF    xb, xa, xa   \
  	VAF    yb, ya, ya   \
  	VAF    zb, za, za   \
  	VX     wa, wd, wd   \
  	VX     xa, xd, xd   \
  	VX     ya, yd, yd   \
  	VX     za, zd, zd   \
  	VERLLF $8, wd, wd   \
  	VERLLF $8, xd, xd   \
  	VERLLF $8, yd, yd   \
  	VERLLF $8, zd, zd   \
  	VAF    wd, wc, wc   \
  	VAF    xd, xc, xc   \
  	VAF    yd, yc, yc   \
  	VAF    zd, zc, zc   \
  	VX     wc, wb, wb   \
  	VX     xc, xb, xb   \
  	VX     yc, yb, yb   \
  	VX     zc, zb, zb   \
  	VERLLF $7, wb, wb   \
  	VERLLF $7, xb, xb   \
  	VERLLF $7, yb, yb   \
  	VERLLF $7, zb, zb
  // PERMUTE rearranges the bytes of each of the four vectors in place,
  // using sel as the VPERM byte selector (the same selector for all four).
  #define PERMUTE(sel, p, q, r, s) \
  	VPERM p, p, sel, p \
  	VPERM q, q, sel, q \
  	VPERM r, r, sel, r \
  	VPERM s, s, sel, s
  // ADDV adds the vector addend to each of the four vectors in place,
  // as four independent 32-bit element additions per vector.
  #define ADDV(addend, p, q, r, s) \
  	VAF addend, p, p \
  	VAF addend, q, q \
  	VAF addend, r, r \
  	VAF addend, s, s
  // XORV processes one 64-byte block: it loads 64 bytes from disp(inp) into
  // M0-M3, XORs them with the keystream vectors k0-k3 and stores the result
  // to disp(outp).
  //
  // Side effects: M0-M3 are overwritten, and k0-k3 are left byte-swapped
  // within each 4-byte element (PERMUTE with BSWAP is applied in place
  // before the XOR).
  #define XORV(disp, outp, inp, k0, k1, k2, k3) \
  	VLM     disp(inp), M0, M3          \
  	PERMUTE(BSWAP, k0, k1, k2, k3)     \
  	VX      k0, M0, M0                 \
  	VX      k1, M1, M1                 \
  	VX      k2, M2, M2                 \
  	VX      k3, M3, M3                 \
  	VSTM    M0, M3, disp(outp)
  // SHUFFLE transposes the 4x4 matrix of 32-bit elements whose rows are
  // in0..in3, in place. hi02, hi13, lo02 and lo13 are scratch vectors and are
  // overwritten.
  #define SHUFFLE(in0, in1, in2, in3, hi02, hi13, lo02, lo13) \
  	VMRHF in0, in2, hi02  \ // hi02 = {in0[0], in2[0], in0[1], in2[1]}
  	VMRHF in1, in3, hi13  \ // hi13 = {in1[0], in3[0], in1[1], in3[1]}
  	VMRLF in0, in2, lo02  \ // lo02 = {in0[2], in2[2], in0[3], in2[3]}
  	VMRLF in1, in3, lo13  \ // lo13 = {in1[2], in3[2], in1[3], in3[3]}
  	VMRHF hi02, hi13, in0 \ // in0 = {in0[0], in1[0], in2[0], in3[0]}
  	VMRLF hi02, hi13, in1 \ // in1 = {in0[1], in1[1], in2[1], in3[1]}
  	VMRHF lo02, lo13, in2 \ // in2 = {in0[2], in1[2], in2[2], in3[2]}
  	VMRLF lo02, lo13, in3   // in3 = {in0[3], in1[3], in2[3], in3[3]}
  // func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
  //
  // xorKeyStreamVX XORs src with ChaCha20 keystream and writes the result to
  // dst, producing four 64-byte blocks (256 bytes) per pass of the outer loop.
  //
  // If len(src) is not a multiple of 256, the trailing len(src)%256 bytes are
  // first copied into buf and *len is set to 256 - len(src)%256. The final
  // pass then runs in place on buf, and only the trailing bytes are copied
  // from buf to dst.
  // NOTE(review): the bytes of buf past the copied tail are XORed with
  // keystream as well; presumably the caller zeroes buf beforehand so they
  // end up as raw keystream - confirm against the Go caller.
  // NOTE(review): for len(src) == 0 a full pass still runs on buf and the
  // counter still advances; presumably callers never pass an empty src.
  //
  // On return *counter holds the counter for the next unused block.
  //
  // Register use:
  //	R0   scratch
  //	R1   constants address, then tail source address, then round counter
  //	R2   output pointer        R3  input pointer
  //	R4   bytes remaining       R5  key
  //	R6   nonce                 R7  counter
  //	R8   buf
  //	R9   len; on the tail path reused to hold the real output pointer
  //	R12  (len(src)-1)&0xff: EXRL length code, 255 means no partial block
  TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
  	MOVD $·constants<>(SB), R1
  	MOVD dst+0(FP), R2         // R2=&dst[0]
  	LMG  src+24(FP), R3, R4    // R3=&src[0] R4=len(src)
  	MOVD key+48(FP), R5        // R5=key
  	MOVD nonce+56(FP), R6      // R6=nonce
  	MOVD counter+64(FP), R7    // R7=counter
  	MOVD buf+72(FP), R8        // R8=buf
  	MOVD len+80(FP), R9        // R9=len

  	// load BSWAP and J0
  	VLM (R1), BSWAP, J0

  	// set up tail buffer
  	ADD     $-1, R4, R12
  	MOVBZ   R12, R12                // R12 = (len(src)-1) & 0xff
  	CMPUBEQ R12, $255, aligned      // no partial block to stage
  	MOVD    R4, R1
  	AND     $~255, R1               // R1 = len(src) rounded down to 256
  	MOVD    $(R3)(R1*1), R1         // R1 = address of the partial block in src
  	EXRL    $·mvcSrcToBuf(SB), R12  // copy R12+1 tail bytes from (R1) to buf
  	MOVD    $255, R0
  	SUB     R12, R0                 // R0 = 255 - R12 = 256 - tail length
  	MOVD    R0, (R9)                // update len

  aligned:
  	// setup
  	//
  	// VLL takes the index of the highest byte to load, so 11 loads exactly
  	// the 12 nonce bytes. A value of 15 or more would load a full 16 bytes,
  	// reading 4 bytes past *nonce. The following 4-byte right shift moves
  	// the nonce words into elements 1-3 and leaves element 0 zero.
  	MOVD  $11, R0
  	VLM   (R5), KEY0, KEY1
  	VLL   R0, (R6), NONCE
  	VZERO M0
  	VLEIB $7, $32, M0               // shift amount: 32 bits
  	VSRLB M0, NONCE, NONCE

  	// initialize counter values
  	VLREPF (R7), CTR                // CTR = {ctr, ctr, ctr, ctr}
  	VZERO  INC
  	VLEIF  $1, $1, INC
  	VLEIF  $2, $2, INC
  	VLEIF  $3, $3, INC              // INC = {0, 1, 2, 3}
  	VAF    INC, CTR, CTR            // CTR = {ctr, ctr+1, ctr+2, ctr+3}
  	VREPIF $4, INC                  // INC = {4, 4, 4, 4}

  chacha:
  	// Build the initial state: each Xn holds state word n replicated for
  	// the four blocks, except X12 which holds the four distinct counters.
  	VREPF $0, J0, X0
  	VREPF $1, J0, X1
  	VREPF $2, J0, X2
  	VREPF $3, J0, X3
  	VREPF $0, KEY0, X4
  	VREPF $1, KEY0, X5
  	VREPF $2, KEY0, X6
  	VREPF $3, KEY0, X7
  	VREPF $0, KEY1, X8
  	VREPF $1, KEY1, X9
  	VREPF $2, KEY1, X10
  	VREPF $3, KEY1, X11
  	VLR   CTR, X12
  	VREPF $1, NONCE, X13
  	VREPF $2, NONCE, X14
  	VREPF $3, NONCE, X15

  	MOVD $(NUM_ROUNDS/2), R1

  loop:
  	// column round followed by diagonal round
  	ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11)
  	ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9)

  	ADD $-1, R1
  	BNE loop

  	// decrement length
  	ADD $-256, R4
  	BLT tail                        // fewer than 256 bytes left: use buf

  continue:
  	// rearrange vectors
  	//
  	// Each SHUFFLE transposes four state words so that every vector holds
  	// four consecutive words of a single block; the matching ADDV then adds
  	// the corresponding words of the initial state.
  	SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
  	ADDV(J0, X0, X1, X2, X3)
  	SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
  	ADDV(KEY0, X4, X5, X6, X7)
  	SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
  	ADDV(KEY1, X8, X9, X10, X11)
  	VAF CTR, X12, X12               // per-block counters, added before the transpose
  	SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
  	ADDV(NONCE, X12, X13, X14, X15) // element 0 of NONCE is zero

  	// increment counters
  	VAF INC, CTR, CTR

  	// xor keystream with plaintext
  	XORV(0*64, R2, R3, X0, X4, X8, X12)
  	XORV(1*64, R2, R3, X1, X5, X9, X13)
  	XORV(2*64, R2, R3, X2, X6, X10, X14)
  	XORV(3*64, R2, R3, X3, X7, X11, X15)

  	// increment pointers
  	MOVD $256(R2), R2
  	MOVD $256(R3), R3

  	CMPBNE  R4, $0, chacha
  	CMPUBEQ R12, $255, return       // no partial block was staged
  	EXRL    $·mvcBufToDst(SB), R12  // len was updated during setup

  return:
  	VSTEF $0, CTR, (R7)             // store the next block counter
  	RET

  tail:
  	// Final partial block: keep the real output pointer in R9 (the len
  	// pointer is no longer needed), run this pass in place on buf, and
  	// zero R4 so the outer loop exits afterwards.
  	MOVD R2, R9
  	MOVD R8, R2
  	MOVD R8, R3
  	MOVD $0, R4
  	JMP  continue