path: root/vendor/github.com/golang/snappy/encode_arm64.s
Diffstat (limited to 'vendor/github.com/golang/snappy/encode_arm64.s')
-rw-r--r--  vendor/github.com/golang/snappy/encode_arm64.s  81
1 file changed, 37 insertions, 44 deletions
diff --git a/vendor/github.com/golang/snappy/encode_arm64.s b/vendor/github.com/golang/snappy/encode_arm64.s
index 1f565ee75f..bf83667d71 100644
--- a/vendor/github.com/golang/snappy/encode_arm64.s
+++ b/vendor/github.com/golang/snappy/encode_arm64.s
@@ -35,11 +35,9 @@ TEXT ·emitLiteral(SB), NOSPLIT, $32-56
MOVW R3, R4
SUBW $1, R4, R4
- MOVW $60, R2
- CMPW R2, R4
+ CMPW $60, R4
BLT oneByte
- MOVW $256, R2
- CMPW R2, R4
+ CMPW $256, R4
BLT twoBytes
threeBytes:
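
For orientation, the compare-immediates above implement the standard Snappy literal-tag size check: len(lit)-1 goes into the tag byte itself when it is below 60, into one trailing length byte when it is below 256, and into two bytes otherwise. A minimal Go sketch of that branch, using a hypothetical helper rather than the vendored emitLiteral:

// emitLiteralTag sketches the branch the CMPW $60 / CMPW $256 pair selects:
// the Snappy literal tag stores len(lit)-1 in the tag byte when it fits in
// 6 bits, otherwise in one or two trailing length bytes.
func emitLiteralTag(dst []byte, litLen int) []byte {
	switch m := litLen - 1; {
	case m < 60: // oneByte: length packed into the tag itself
		return append(dst, uint8(m)<<2)
	case m < 1<<8: // twoBytes: tag 60, then one length byte
		return append(dst, 60<<2, uint8(m))
	default: // threeBytes: tag 61, then two little-endian length bytes
		return append(dst, 61<<2, uint8(m), uint8(m>>8))
	}
}
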
@@ -98,8 +96,7 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48
loop0:
// for length >= 68 { etc }
- MOVW $68, R2
- CMPW R2, R3
+ CMPW $68, R3
BLT step1
// Emit a length 64 copy, encoded as 3 bytes.
@@ -112,9 +109,8 @@ loop0:
step1:
// if length > 64 { etc }
- MOVD $64, R2
- CMP R2, R3
- BLE step2
+ CMP $64, R3
+ BLE step2
// Emit a length 60 copy, encoded as 3 bytes.
MOVD $0xee, R2
@@ -125,11 +121,9 @@ step1:
step2:
// if length >= 12 || offset >= 2048 { goto step3 }
- MOVD $12, R2
- CMP R2, R3
+ CMP $12, R3
BGE step3
- MOVW $2048, R2
- CMPW R2, R11
+ CMPW $2048, R11
BGE step3
// Emit the remaining copy, encoded as 2 bytes.
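
The three emitCopy hunks above only fold constants into the compares; the branch structure they preserve is the usual Snappy copy encoding. A rough Go sketch under that assumption, with hypothetical names:

const (
	tagCopy1 = 0x01 // copy with 1-byte offset
	tagCopy2 = 0x02 // copy with 2-byte offset
)

// emitCopySketch mirrors the branch structure kept by the hunks above: peel
// off length-64 copies while length >= 68, optionally one length-60 copy,
// then finish with a 3-byte copy-2, or a 2-byte copy-1 when length < 12 and
// offset < 2048. Illustrative only, not the vendored code.
func emitCopySketch(dst []byte, offset, length int) []byte {
	for length >= 68 {
		dst = append(dst, 63<<2|tagCopy2, uint8(offset), uint8(offset>>8))
		length -= 64
	}
	if length > 64 {
		dst = append(dst, 59<<2|tagCopy2, uint8(offset), uint8(offset>>8)) // 0xee tag
		length -= 60
	}
	if length >= 12 || offset >= 2048 {
		return append(dst, uint8(length-1)<<2|tagCopy2, uint8(offset), uint8(offset>>8))
	}
	return append(dst, uint8(offset>>8)<<5|uint8(length-4)<<2|tagCopy1, uint8(offset))
}
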
@@ -295,27 +289,24 @@ varTable:
// var table [maxTableSize]uint16
//
// In the asm code, unlike the Go code, we can zero-initialize only the
- // first tableSize elements. Each uint16 element is 2 bytes and each VST1
- // writes 64 bytes, so we can do only tableSize/32 writes instead of the
- // 2048 writes that would zero-initialize all of table's 32768 bytes.
- // This clear could overrun the first tableSize elements, but it won't
- // overrun the allocated stack size.
+ // first tableSize elements. Each uint16 element is 2 bytes and each
+ // iteration writes 64 bytes, so we can do only tableSize/32 writes
+ // instead of the 2048 writes that would zero-initialize all of table's
+ // 32768 bytes. This clear could overrun the first tableSize elements, but
+ // it won't overrun the allocated stack size.
ADD $128, RSP, R17
MOVD R17, R4
// !!! R6 = &src[tableSize]
ADD R6<<1, R17, R6
- // zero the SIMD registers
- VEOR V0.B16, V0.B16, V0.B16
- VEOR V1.B16, V1.B16, V1.B16
- VEOR V2.B16, V2.B16, V2.B16
- VEOR V3.B16, V3.B16, V3.B16
-
memclr:
- VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R4)
- CMP R4, R6
- BHI memclr
+ STP.P (ZR, ZR), 64(R4)
+ STP (ZR, ZR), -48(R4)
+ STP (ZR, ZR), -32(R4)
+ STP (ZR, ZR), -16(R4)
+ CMP R4, R6
+ BHI memclr
// !!! R6 = &src[0]
MOVD R7, R6
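
The memclr hunk above swaps the four-vector VST1 store for four stores of the zero-register pair, still clearing 64 bytes (32 uint16 slots) per iteration: the STP.P stores the first 16 bytes and post-increments R4 by 64, and the three following STPs fill the remaining 48 bytes at negative offsets. A Go-level sketch of the same clear, with illustrative names:

// clearTable gives a Go-level picture of the memclr loop: zero at least the
// first tableSize uint16 slots in 64-byte (32-element) passes, matching the
// four zero-register pair stores per iteration. The real loop works on the
// stack-allocated table.
func clearTable(table []uint16, tableSize int) {
	for i := 0; i < tableSize; i += 32 {
		end := i + 32
		if end > len(table) {
			end = len(table)
		}
		for j := i; j < end; j++ {
			table[j] = 0
		}
	}
}
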
@@ -404,8 +395,7 @@ fourByteMatch:
// on inputMargin in encode.go.
MOVD R7, R3
SUB R10, R3, R3
- MOVD $16, R2
- CMP R2, R3
+ CMP $16, R3
BLE emitLiteralFastPath
// ----------------------------------------
@@ -454,18 +444,21 @@ inlineEmitLiteralMemmove:
MOVD R3, 24(RSP)
// Finish the "d +=" part of "d += emitLiteral(etc)".
- ADD R3, R8, R8
- MOVD R7, 80(RSP)
- MOVD R8, 88(RSP)
- MOVD R15, 120(RSP)
- CALL runtime·memmove(SB)
- MOVD 64(RSP), R5
- MOVD 72(RSP), R6
- MOVD 80(RSP), R7
- MOVD 88(RSP), R8
- MOVD 96(RSP), R9
- MOVD 120(RSP), R15
- B inner1
+ ADD R3, R8, R8
+ MOVD R7, 80(RSP)
+ MOVD R8, 88(RSP)
+ MOVD R15, 120(RSP)
+ CALL runtime·memmove(SB)
+ MOVD 64(RSP), R5
+ MOVD 72(RSP), R6
+ MOVD 80(RSP), R7
+ MOVD 88(RSP), R8
+ MOVD 96(RSP), R9
+ MOVD 120(RSP), R15
+ ADD $128, RSP, R17
+ MOVW $0xa7bd, R16
+ MOVKW $(0x1e35<<16), R16
+ B inner1
inlineEmitLiteralEnd:
// End inline of the emitLiteral call.
@@ -489,9 +482,9 @@ emitLiteralFastPath:
// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
// 16-byte loads and stores. This technique probably wouldn't be as
// effective on architectures that are fussier about alignment.
- VLD1 0(R10), [V0.B16]
- VST1 [V0.B16], 0(R8)
- ADD R3, R8, R8
+ LDP 0(R10), (R0, R1)
+ STP (R0, R1), 0(R8)
+ ADD R3, R8, R8
inner1:
// for { etc }
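
Finally, the emitLiteralFastPath hunk above replaces the 16-byte vector copy with an LDP/STP pair, i.e. two unaligned 8-byte loads and stores. A rough Go analogue of that fast path, as a hypothetical helper that assumes at least 16 bytes of slack on both sides, as the surrounding code guarantees:

import "encoding/binary"

// copy16 mirrors the LDP/STP pair: two unaligned 8-byte loads followed by
// two 8-byte stores, in place of the old single 16-byte vector load/store.
func copy16(dst, src []byte) {
	lo := binary.LittleEndian.Uint64(src[0:8])
	hi := binary.LittleEndian.Uint64(src[8:16])
	binary.LittleEndian.PutUint64(dst[0:8], lo)
	binary.LittleEndian.PutUint64(dst[8:16], hi)
}
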