Diffstat (limited to 'vendor/github.com/golang/snappy/encode_arm64.s')
-rw-r--r-- | vendor/github.com/golang/snappy/encode_arm64.s | 81 |
1 file changed, 37 insertions, 44 deletions
diff --git a/vendor/github.com/golang/snappy/encode_arm64.s b/vendor/github.com/golang/snappy/encode_arm64.s
index 1f565ee75f..bf83667d71 100644
--- a/vendor/github.com/golang/snappy/encode_arm64.s
+++ b/vendor/github.com/golang/snappy/encode_arm64.s
@@ -35,11 +35,9 @@ TEXT ·emitLiteral(SB), NOSPLIT, $32-56
 	MOVW	R3, R4
 	SUBW	$1, R4, R4
 
-	MOVW	$60, R2
-	CMPW	R2, R4
+	CMPW	$60, R4
 	BLT	oneByte
-	MOVW	$256, R2
-	CMPW	R2, R4
+	CMPW	$256, R4
 	BLT	twoBytes
 
 threeBytes:
@@ -98,8 +96,7 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48
 
 loop0:
 	// for length >= 68 { etc }
-	MOVW	$68, R2
-	CMPW	R2, R3
+	CMPW	$68, R3
 	BLT	step1
 
 	// Emit a length 64 copy, encoded as 3 bytes.
@@ -112,9 +109,8 @@ loop0:
 
 step1:
 	// if length > 64 { etc }
-	MOVD	$64, R2
-	CMP	R2, R3
-	BLE	step2
+	CMP	$64, R3
+	BLE	step2
 
 	// Emit a length 60 copy, encoded as 3 bytes.
 	MOVD	$0xee, R2
@@ -125,11 +121,9 @@ step1:
 
 step2:
 	// if length >= 12 || offset >= 2048 { goto step3 }
-	MOVD	$12, R2
-	CMP	R2, R3
+	CMP	$12, R3
 	BGE	step3
-	MOVW	$2048, R2
-	CMPW	R2, R11
+	CMPW	$2048, R11
 	BGE	step3
 
 	// Emit the remaining copy, encoded as 2 bytes.
@@ -295,27 +289,24 @@ varTable:
 	// var table [maxTableSize]uint16
 	//
 	// In the asm code, unlike the Go code, we can zero-initialize only the
-	// first tableSize elements. Each uint16 element is 2 bytes and each VST1
-	// writes 64 bytes, so we can do only tableSize/32 writes instead of the
-	// 2048 writes that would zero-initialize all of table's 32768 bytes.
-	// This clear could overrun the first tableSize elements, but it won't
-	// overrun the allocated stack size.
+	// first tableSize elements. Each uint16 element is 2 bytes and each
+	// iterations writes 64 bytes, so we can do only tableSize/32 writes
+	// instead of the 2048 writes that would zero-initialize all of table's
+	// 32768 bytes. This clear could overrun the first tableSize elements, but
+	// it won't overrun the allocated stack size.
 	ADD	$128, RSP, R17
 	MOVD	R17, R4
 
 	// !!! R6 = &src[tableSize]
 	ADD	R6<<1, R17, R6
 
-	// zero the SIMD registers
-	VEOR	V0.B16, V0.B16, V0.B16
-	VEOR	V1.B16, V1.B16, V1.B16
-	VEOR	V2.B16, V2.B16, V2.B16
-	VEOR	V3.B16, V3.B16, V3.B16
-
 memclr:
-	VST1.P	[V0.B16, V1.B16, V2.B16, V3.B16], 64(R4)
-	CMP	R4, R6
-	BHI	memclr
+	STP.P	(ZR, ZR), 64(R4)
+	STP	(ZR, ZR), -48(R4)
+	STP	(ZR, ZR), -32(R4)
+	STP	(ZR, ZR), -16(R4)
+	CMP	R4, R6
+	BHI	memclr
 
 	// !!! R6 = &src[0]
 	MOVD	R7, R6
@@ -404,8 +395,7 @@ fourByteMatch:
 	// on inputMargin in encode.go.
 	MOVD	R7, R3
 	SUB	R10, R3, R3
-	MOVD	$16, R2
-	CMP	R2, R3
+	CMP	$16, R3
 	BLE	emitLiteralFastPath
 
 	// ----------------------------------------
@@ -454,18 +444,21 @@ inlineEmitLiteralMemmove:
 	MOVD	R3, 24(RSP)
 
 	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADD	R3, R8, R8
-	MOVD	R7, 80(RSP)
-	MOVD	R8, 88(RSP)
-	MOVD	R15, 120(RSP)
-	CALL	runtime·memmove(SB)
-	MOVD	64(RSP), R5
-	MOVD	72(RSP), R6
-	MOVD	80(RSP), R7
-	MOVD	88(RSP), R8
-	MOVD	96(RSP), R9
-	MOVD	120(RSP), R15
-	B	inner1
+	ADD	R3, R8, R8
+	MOVD	R7, 80(RSP)
+	MOVD	R8, 88(RSP)
+	MOVD	R15, 120(RSP)
+	CALL	runtime·memmove(SB)
+	MOVD	64(RSP), R5
+	MOVD	72(RSP), R6
+	MOVD	80(RSP), R7
+	MOVD	88(RSP), R8
+	MOVD	96(RSP), R9
+	MOVD	120(RSP), R15
+	ADD	$128, RSP, R17
+	MOVW	$0xa7bd, R16
+	MOVKW	$(0x1e35<<16), R16
+	B	inner1
 
 inlineEmitLiteralEnd:
 	// End inline of the emitLiteral call.
@@ -489,9 +482,9 @@ emitLiteralFastPath:
 	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
 	// 16-byte loads and stores. This technique probably wouldn't be as
 	// effective on architectures that are fussier about alignment.
-	VLD1	0(R10), [V0.B16]
-	VST1	[V0.B16], 0(R8)
-	ADD	R3, R8, R8
+	LDP	0(R10), (R0, R1)
+	STP	(R0, R1), 0(R8)
+	ADD	R3, R8, R8
 
 inner1:
 	// for { etc }
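For context on the CMPW $60 and CMPW $256 comparisons in the first hunk: emitLiteral picks a 1-, 2- or 3-byte literal tag from the literal length, matching the pure-Go encoder in this package. The sketch below approximates that Go logic; the tagLiteral value and exact function shape are assumed from the package's encode.go rather than shown in this diff.

package snappyref // hypothetical package holding these reference sketches

// emitLiteralSketch writes the literal tag followed by the literal bytes.
// Lengths whose (len-1) value is below 60 fit in the tag byte itself; the
// values 60 and 61 instead announce one or two trailing length bytes.
func emitLiteralSketch(dst, lit []byte) int {
	const tagLiteral = 0x00 // assumed format constant
	i, n := 0, uint32(len(lit)-1)
	switch {
	case n < 60:
		dst[0] = uint8(n)<<2 | tagLiteral
		i = 1
	case n < 1<<8:
		dst[0] = 60<<2 | tagLiteral
		dst[1] = uint8(n)
		i = 2
	default:
		dst[0] = 61<<2 | tagLiteral
		dst[1] = uint8(n)
		dst[2] = uint8(n >> 8)
		i = 3
	}
	return i + copy(dst[i:], lit)
}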
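Similarly, the $68, $64, $12 and $2048 comparisons in the emitCopy hunks implement the copy-op encoding: long matches are split into 3-byte ops (the 0xfe and 0xee tag bytes seen in the asm are 63<<2|tagCopy2 and 59<<2|tagCopy2), and a final copy with length < 12 and offset < 2048 fits in 2 bytes. A hedged Go sketch of that encoding, with the tag constants assumed from the snappy format rather than this diff:

package snappyref // hypothetical package holding these reference sketches

// emitCopySketch mirrors the branch structure of the asm emitCopy.
func emitCopySketch(dst []byte, offset, length int) int {
	const (
		tagCopy1 = 0x01 // 2-byte copy op (assumed format constant)
		tagCopy2 = 0x02 // 3-byte copy op (assumed format constant)
	)
	i := 0
	for length >= 68 {
		// Emit a length 64 copy, encoded as 3 bytes.
		dst[i+0] = 63<<2 | tagCopy2 // 0xfe
		dst[i+1] = uint8(offset)
		dst[i+2] = uint8(offset >> 8)
		i += 3
		length -= 64
	}
	if length > 64 {
		// Emit a length 60 copy, encoded as 3 bytes.
		dst[i+0] = 59<<2 | tagCopy2 // 0xee
		dst[i+1] = uint8(offset)
		dst[i+2] = uint8(offset >> 8)
		i += 3
		length -= 60
	}
	if length >= 12 || offset >= 2048 {
		// Emit the remaining copy, encoded as 3 bytes.
		dst[i+0] = uint8(length-1)<<2 | tagCopy2
		dst[i+1] = uint8(offset)
		dst[i+2] = uint8(offset >> 8)
		return i + 3
	}
	// Emit the remaining copy, encoded as 2 bytes: 3 offset bits and the
	// length share the tag byte, the low offset byte follows.
	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
	dst[i+1] = uint8(offset)
	return i + 2
}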
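The varTable and inlineEmitLiteralMemmove hunks are related: the rewritten memclr loop now stores 64 bytes per iteration with four STP pairs of ZR, so zeroing the first tableSize uint16 entries takes tableSize/32 iterations, and after CALL runtime·memmove(SB) the code re-derives R17 (the table base at RSP+128) and R16 (the hash multiplier 0x1e35a7bd) because a call may clobber those registers. A small Go sketch of the corresponding sizing and hash arithmetic, with maxTableSize and the loop bounds assumed from this package's encode.go:

package snappyref // hypothetical package holding these reference sketches

// tableSizingSketch shows how tableSize and the hash shift grow with the
// input, and how many 64-byte clear iterations that implies for the asm.
func tableSizingSketch(srcLen int) (tableSize int, shift uint32, clearIters int) {
	const maxTableSize = 1 << 14 // 16384 uint16 entries == 32768 bytes
	shift = 32 - 8
	for tableSize = 1 << 8; tableSize < maxTableSize && tableSize < srcLen; tableSize *= 2 {
		shift--
	}
	clearIters = tableSize / 32 // each memclr iteration zeroes 32 entries
	return tableSize, shift, clearIters
}

// hashSketch is the multiplicative hash whose constant the asm rebuilds in R16.
func hashSketch(u, shift uint32) uint32 {
	return (u * 0x1e35a7bd) >> shift
}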