author     Thomas Boerger <thomas@webhippie.de>  2016-11-03 23:16:01 +0100
committer  Thomas Boerger <thomas@webhippie.de>  2016-11-04 08:43:11 +0100
commit     1ebb35b98889ff77299f24d82da426b434b0cca0 (patch)
tree       6dcb814d6df4d11c7e7a0ba6da8a6945628e2c5d /vendor/github.com/klauspost
parent     78f86abba45cb35018c58b8bd5f4c48a86cc8634 (diff)
download   gitea-1ebb35b98889ff77299f24d82da426b434b0cca0.tar.gz
           gitea-1ebb35b98889ff77299f24d82da426b434b0cca0.zip
Added all required dependencies
Diffstat (limited to 'vendor/github.com/klauspost')
-rw-r--r--  vendor/github.com/klauspost/compress/LICENSE  27
-rw-r--r--  vendor/github.com/klauspost/compress/flate/copy.go  32
-rw-r--r--  vendor/github.com/klauspost/compress/flate/crc32_amd64.go  41
-rw-r--r--  vendor/github.com/klauspost/compress/flate/crc32_amd64.s  213
-rw-r--r--  vendor/github.com/klauspost/compress/flate/crc32_noasm.go  35
-rw-r--r--  vendor/github.com/klauspost/compress/flate/deflate.go  1351
-rw-r--r--  vendor/github.com/klauspost/compress/flate/dict_decoder.go  184
-rw-r--r--  vendor/github.com/klauspost/compress/flate/gen.go  265
-rw-r--r--  vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go  701
-rw-r--r--  vendor/github.com/klauspost/compress/flate/huffman_code.go  344
-rw-r--r--  vendor/github.com/klauspost/compress/flate/inflate.go  846
-rw-r--r--  vendor/github.com/klauspost/compress/flate/reverse_bits.go  48
-rw-r--r--  vendor/github.com/klauspost/compress/flate/snappy.go  856
-rw-r--r--  vendor/github.com/klauspost/compress/flate/token.go  115
-rw-r--r--  vendor/github.com/klauspost/compress/gzip/gunzip.go  344
-rw-r--r--  vendor/github.com/klauspost/compress/gzip/gzip.go  251
-rw-r--r--  vendor/github.com/klauspost/cpuid/LICENSE  22
-rw-r--r--  vendor/github.com/klauspost/cpuid/README.md  145
-rw-r--r--  vendor/github.com/klauspost/cpuid/cpuid.go  1022
-rw-r--r--  vendor/github.com/klauspost/cpuid/cpuid_386.s  42
-rw-r--r--  vendor/github.com/klauspost/cpuid/cpuid_amd64.s  42
-rw-r--r--  vendor/github.com/klauspost/cpuid/detect_intel.go  17
-rw-r--r--  vendor/github.com/klauspost/cpuid/detect_ref.go  23
-rw-r--r--  vendor/github.com/klauspost/cpuid/generate.go  3
-rw-r--r--  vendor/github.com/klauspost/cpuid/private-gen.go  476
-rw-r--r--  vendor/github.com/klauspost/crc32/LICENSE  28
-rw-r--r--  vendor/github.com/klauspost/crc32/README.md  87
-rw-r--r--  vendor/github.com/klauspost/crc32/crc32.go  207
-rw-r--r--  vendor/github.com/klauspost/crc32/crc32_amd64.go  230
-rw-r--r--  vendor/github.com/klauspost/crc32/crc32_amd64.s  319
-rw-r--r--  vendor/github.com/klauspost/crc32/crc32_amd64p32.go  43
-rw-r--r--  vendor/github.com/klauspost/crc32/crc32_amd64p32.s  67
-rw-r--r--  vendor/github.com/klauspost/crc32/crc32_generic.go  89
-rw-r--r--  vendor/github.com/klauspost/crc32/crc32_otherarch.go  15
-rw-r--r--  vendor/github.com/klauspost/crc32/crc32_s390x.go  91
-rw-r--r--  vendor/github.com/klauspost/crc32/crc32_s390x.s  249
36 files changed, 8870 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/LICENSE b/vendor/github.com/klauspost/compress/LICENSE
new file mode 100644
index 0000000000..7448756763
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2012 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/klauspost/compress/flate/copy.go b/vendor/github.com/klauspost/compress/flate/copy.go
new file mode 100644
index 0000000000..a3200a8f49
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/copy.go
@@ -0,0 +1,32 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// forwardCopy is like the built-in copy function except that it always goes
+// forward from the start, even if the dst and src overlap.
+// It is equivalent to:
+// for i := 0; i < n; i++ {
+// mem[dst+i] = mem[src+i]
+// }
+func forwardCopy(mem []byte, dst, src, n int) {
+ if dst <= src {
+ copy(mem[dst:dst+n], mem[src:src+n])
+ return
+ }
+ for {
+ if dst >= src+n {
+ copy(mem[dst:dst+n], mem[src:src+n])
+ return
+ }
+ // There is some forward overlap. The destination
+ // will be filled with a repeated pattern of mem[src:src+k].
+ // We copy one instance of the pattern here, then repeat.
+ // Each time around this loop k will double.
+ k := dst - src
+ copy(mem[dst:dst+k], mem[src:src+k])
+ n -= k
+ dst += k
+ }
+}
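
The comment above describes how forwardCopy turns an overlapping copy into a repeated pattern. As a rough standalone illustration (a copy of the same logic for demo purposes, since the vendored helper is unexported; not part of the patch):

package main

import "fmt"

// forwardCopy mirrors the vendored helper: it always copies from the start,
// so an overlapping region replicates the pattern mem[src:src+k].
func forwardCopy(mem []byte, dst, src, n int) {
	if dst <= src {
		copy(mem[dst:dst+n], mem[src:src+n])
		return
	}
	for {
		if dst >= src+n {
			copy(mem[dst:dst+n], mem[src:src+n])
			return
		}
		// Overlap: copy one instance of the pattern, then repeat.
		// k doubles each time around the loop.
		k := dst - src
		copy(mem[dst:dst+k], mem[src:src+k])
		n -= k
		dst += k
	}
}

func main() {
	mem := []byte("abXXXXXX")
	// Copy 6 bytes from offset 0 into offset 2: the 2-byte pattern "ab"
	// is repeated, which is how LZ77 expresses run-length encoding.
	forwardCopy(mem, 2, 0, 6)
	fmt.Println(string(mem)) // abababab
}
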
diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.go b/vendor/github.com/klauspost/compress/flate/crc32_amd64.go
new file mode 100644
index 0000000000..70a6095e60
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/crc32_amd64.go
@@ -0,0 +1,41 @@
+//+build !noasm
+//+build !appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+package flate
+
+import (
+ "github.com/klauspost/cpuid"
+)
+
+// crc32sse returns a hash for the first 4 bytes of the slice
+// len(a) must be >= 4.
+//go:noescape
+func crc32sse(a []byte) uint32
+
+// crc32sseAll calculates hashes for each 4-byte set in a.
+// dst must be at least len(a) - 4 in size.
+// The size is not checked by the assembly.
+//go:noescape
+func crc32sseAll(a []byte, dst []uint32)
+
+// matchLenSSE4 returns the number of matching bytes in a and b
+// up to length 'max'. Both slices must be at least 'max'
+// bytes in size.
+//
+// TODO: drop the "SSE4" name, since it doesn't use any SSE instructions.
+//
+//go:noescape
+func matchLenSSE4(a, b []byte, max int) int
+
+// histogram accumulates a histogram of b in h.
+// h must be at least 256 entries in length,
+// and must be cleared before calling this function.
+//go:noescape
+func histogram(b []byte, h []int32)
+
+// Detect SSE 4.2 feature.
+func init() {
+ useSSE42 = cpuid.CPU.SSE42()
+}
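
The init above enables the assembler code paths only when SSE 4.2 is reported by the vendored cpuid package. A minimal sketch of that gating pattern (the helper names portableHash and sse42Hash are made up for illustration):

package main

import (
	"fmt"

	"github.com/klauspost/cpuid"
)

// hashAll stands in for a function pointer chosen at init time, mirroring
// how the flate package swaps bulkHash4 for crc32sseAll.
var hashAll func(b []byte, dst []uint32)

// portableHash is a generic fallback (body omitted in this sketch).
func portableHash(b []byte, dst []uint32) {}

// sse42Hash represents the CRC32-instruction fast path (body omitted).
func sse42Hash(b []byte, dst []uint32) {}

func init() {
	// cpuid.CPU is populated at package init; SSE42() reports CPU support.
	if cpuid.CPU.SSE42() {
		hashAll = sse42Hash
	} else {
		hashAll = portableHash
	}
}

func main() {
	fmt.Println("SSE 4.2 available:", cpuid.CPU.SSE42())
}
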
diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.s b/vendor/github.com/klauspost/compress/flate/crc32_amd64.s
new file mode 100644
index 0000000000..2fb2079b9d
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/crc32_amd64.s
@@ -0,0 +1,213 @@
+//+build !noasm
+//+build !appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+// func crc32sse(a []byte) uint32
+TEXT ·crc32sse(SB), 4, $0
+ MOVQ a+0(FP), R10
+ XORQ BX, BX
+
+ // CRC32 dword (R10), EBX
+ BYTE $0xF2; BYTE $0x41; BYTE $0x0f
+ BYTE $0x38; BYTE $0xf1; BYTE $0x1a
+
+ MOVL BX, ret+24(FP)
+ RET
+
+// func crc32sseAll(a []byte, dst []uint32)
+TEXT ·crc32sseAll(SB), 4, $0
+ MOVQ a+0(FP), R8 // R8: src
+ MOVQ a_len+8(FP), R10 // input length
+ MOVQ dst+24(FP), R9 // R9: dst
+ SUBQ $4, R10
+ JS end
+ JZ one_crc
+ MOVQ R10, R13
+ SHRQ $2, R10 // len/4
+ ANDQ $3, R13 // len&3
+ XORQ BX, BX
+ ADDQ $1, R13
+ TESTQ R10, R10
+ JZ rem_loop
+
+crc_loop:
+ MOVQ (R8), R11
+ XORQ BX, BX
+ XORQ DX, DX
+ XORQ DI, DI
+ MOVQ R11, R12
+ SHRQ $8, R11
+ MOVQ R12, AX
+ MOVQ R11, CX
+ SHRQ $16, R12
+ SHRQ $16, R11
+ MOVQ R12, SI
+
+ // CRC32 EAX, EBX
+ BYTE $0xF2; BYTE $0x0f
+ BYTE $0x38; BYTE $0xf1; BYTE $0xd8
+
+ // CRC32 ECX, EDX
+ BYTE $0xF2; BYTE $0x0f
+ BYTE $0x38; BYTE $0xf1; BYTE $0xd1
+
+ // CRC32 ESI, EDI
+ BYTE $0xF2; BYTE $0x0f
+ BYTE $0x38; BYTE $0xf1; BYTE $0xfe
+ MOVL BX, (R9)
+ MOVL DX, 4(R9)
+ MOVL DI, 8(R9)
+
+ XORQ BX, BX
+ MOVL R11, AX
+
+ // CRC32 EAX, EBX
+ BYTE $0xF2; BYTE $0x0f
+ BYTE $0x38; BYTE $0xf1; BYTE $0xd8
+ MOVL BX, 12(R9)
+
+ ADDQ $16, R9
+ ADDQ $4, R8
+ XORQ BX, BX
+ SUBQ $1, R10
+ JNZ crc_loop
+
+rem_loop:
+ MOVL (R8), AX
+
+ // CRC32 EAX, EBX
+ BYTE $0xF2; BYTE $0x0f
+ BYTE $0x38; BYTE $0xf1; BYTE $0xd8
+
+ MOVL BX, (R9)
+ ADDQ $4, R9
+ ADDQ $1, R8
+ XORQ BX, BX
+ SUBQ $1, R13
+ JNZ rem_loop
+
+end:
+ RET
+
+one_crc:
+ MOVQ $1, R13
+ XORQ BX, BX
+ JMP rem_loop
+
+// func matchLenSSE4(a, b []byte, max int) int
+TEXT ·matchLenSSE4(SB), 4, $0
+ MOVQ a_base+0(FP), SI
+ MOVQ b_base+24(FP), DI
+ MOVQ DI, DX
+ MOVQ max+48(FP), CX
+
+cmp8:
+ // As long as we are 8 or more bytes before the end of max, we can load and
+ // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+ CMPQ CX, $8
+ JLT cmp1
+ MOVQ (SI), AX
+ MOVQ (DI), BX
+ CMPQ AX, BX
+ JNE bsf
+ ADDQ $8, SI
+ ADDQ $8, DI
+ SUBQ $8, CX
+ JMP cmp8
+
+bsf:
+ // If those 8 bytes were not equal, XOR the two 8 byte values, and return
+ // the index of the first byte that differs. The BSF instruction finds the
+ // least significant 1 bit, the amd64 architecture is little-endian, and
+ // the shift by 3 converts a bit index to a byte index.
+ XORQ AX, BX
+ BSFQ BX, BX
+ SHRQ $3, BX
+ ADDQ BX, DI
+
+ // Subtract off &b[0] to convert from &b[ret] to ret, and return.
+ SUBQ DX, DI
+ MOVQ DI, ret+56(FP)
+ RET
+
+cmp1:
+ // In the slices' tail, compare 1 byte at a time.
+ CMPQ CX, $0
+ JEQ matchLenEnd
+ MOVB (SI), AX
+ MOVB (DI), BX
+ CMPB AX, BX
+ JNE matchLenEnd
+ ADDQ $1, SI
+ ADDQ $1, DI
+ SUBQ $1, CX
+ JMP cmp1
+
+matchLenEnd:
+ // Subtract off &b[0] to convert from &b[ret] to ret, and return.
+ SUBQ DX, DI
+ MOVQ DI, ret+56(FP)
+ RET
+
+// func histogram(b []byte, h []int32)
+TEXT ·histogram(SB), 4, $0
+ MOVQ b+0(FP), SI // SI: &b
+ MOVQ b_len+8(FP), R9 // R9: len(b)
+ MOVQ h+24(FP), DI // DI: Histogram
+ MOVQ R9, R8
+ SHRQ $3, R8
+ JZ hist1
+ XORQ R11, R11
+
+loop_hist8:
+ MOVQ (SI), R10
+
+ MOVB R10, R11
+ INCL (DI)(R11*4)
+ SHRQ $8, R10
+
+ MOVB R10, R11
+ INCL (DI)(R11*4)
+ SHRQ $8, R10
+
+ MOVB R10, R11
+ INCL (DI)(R11*4)
+ SHRQ $8, R10
+
+ MOVB R10, R11
+ INCL (DI)(R11*4)
+ SHRQ $8, R10
+
+ MOVB R10, R11
+ INCL (DI)(R11*4)
+ SHRQ $8, R10
+
+ MOVB R10, R11
+ INCL (DI)(R11*4)
+ SHRQ $8, R10
+
+ MOVB R10, R11
+ INCL (DI)(R11*4)
+ SHRQ $8, R10
+
+ INCL (DI)(R10*4)
+
+ ADDQ $8, SI
+ DECQ R8
+ JNZ loop_hist8
+
+hist1:
+ ANDQ $7, R9
+ JZ end_hist
+ XORQ R10, R10
+
+loop_hist1:
+ MOVB (SI), R10
+ INCL (DI)(R10*4)
+ INCQ SI
+ DECQ R9
+ JNZ loop_hist1
+
+end_hist:
+ RET
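
The cmp8/bsf comments above describe comparing eight bytes at a time and using XOR plus bit-scan-forward to locate the first differing byte. A pure-Go sketch of the same technique (an illustration, not the vendored routine) using math/bits:

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// matchLen returns the number of leading bytes that a and b have in common,
// up to max. Both slices must be at least max bytes long.
func matchLen(a, b []byte, max int) int {
	n := 0
	// Compare 8 bytes at a time, like the cmp8 loop in the assembly.
	for max-n >= 8 {
		x := binary.LittleEndian.Uint64(a[n:])
		y := binary.LittleEndian.Uint64(b[n:])
		if x != y {
			// XOR marks the differing bits; on little-endian data the
			// lowest set bit belongs to the first differing byte, and
			// dividing the bit index by 8 gives the byte index
			// (the BSF + SHR $3 trick in the assembly).
			return n + bits.TrailingZeros64(x^y)/8
		}
		n += 8
	}
	// Tail: compare one byte at a time, like the cmp1 loop.
	for n < max && a[n] == b[n] {
		n++
	}
	return n
}

func main() {
	a := []byte("deflate compresses data")
	b := []byte("deflate compacts data!!")
	fmt.Println(matchLen(a, b, len(a))) // 12 ("deflate comp")
}
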
diff --git a/vendor/github.com/klauspost/compress/flate/crc32_noasm.go b/vendor/github.com/klauspost/compress/flate/crc32_noasm.go
new file mode 100644
index 0000000000..bd98bd598f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/crc32_noasm.go
@@ -0,0 +1,35 @@
+//+build !amd64 noasm appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+package flate
+
+func init() {
+ useSSE42 = false
+}
+
+// crc32sse should never be called.
+func crc32sse(a []byte) uint32 {
+ panic("no assembler")
+}
+
+// crc32sseAll should never be called.
+func crc32sseAll(a []byte, dst []uint32) {
+ panic("no assembler")
+}
+
+// matchLenSSE4 should never be called.
+func matchLenSSE4(a, b []byte, max int) int {
+ panic("no assembler")
+ return 0
+}
+
+// histogram accumulates a histogram of b in h.
+//
+// len(h) must be >= 256, and h's elements must be all zeroes.
+func histogram(b []byte, h []int32) {
+ h = h[:256]
+ for _, t := range b {
+ h[t]++
+ }
+}
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go
new file mode 100644
index 0000000000..76e9682f7e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -0,0 +1,1351 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright (c) 2015 Klaus Post
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+ "fmt"
+ "io"
+ "math"
+)
+
+const (
+ NoCompression = 0
+ BestSpeed = 1
+ BestCompression = 9
+ DefaultCompression = -1
+
+ // HuffmanOnly disables Lempel-Ziv match searching and only performs Huffman
+ // entropy encoding. This mode is useful in compressing data that has
+ // already been compressed with an LZ style algorithm (e.g. Snappy or LZ4)
+ // that lacks an entropy encoder. Compression gains are achieved when
+ // certain bytes in the input stream occur more frequently than others.
+ //
+ // Note that HuffmanOnly produces a compressed output that is
+ // RFC 1951 compliant. That is, any valid DEFLATE decompressor will
+ // continue to be able to decompress this output.
+ HuffmanOnly = -2
+ ConstantCompression = HuffmanOnly // compatibility alias.
+
+ logWindowSize = 15
+ windowSize = 1 << logWindowSize
+ windowMask = windowSize - 1
+ logMaxOffsetSize = 15 // Standard DEFLATE
+ minMatchLength = 4 // The smallest match that the compressor looks for
+ maxMatchLength = 258 // The longest match for the compressor
+ minOffsetSize = 1 // The shortest offset that makes any sense
+
+	// The maximum number of tokens we put into a single flate block, just to
+	// stop things from getting too large.
+ maxFlateBlockTokens = 1 << 14
+ maxStoreBlockSize = 65535
+ hashBits = 17 // After 17 performance degrades
+ hashSize = 1 << hashBits
+ hashMask = (1 << hashBits) - 1
+ hashShift = (hashBits + minMatchLength - 1) / minMatchLength
+ maxHashOffset = 1 << 24
+
+ skipNever = math.MaxInt32
+)
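
As a hedged usage sketch of the HuffmanOnly mode documented in the constants above (it relies only on this package's exported API; the payload is invented):

package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/klauspost/compress/flate"
)

func main() {
	// Data that is already LZ-compressed (e.g. Snappy output) rarely
	// benefits from match searching, so only Huffman-encode it.
	payload := bytes.Repeat([]byte("already-compressed-ish payload "), 32)

	var buf bytes.Buffer
	w, err := flate.NewWriter(&buf, flate.HuffmanOnly)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := w.Write(payload); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
	// The output is still RFC 1951 DEFLATE, so any inflater can decode it.
	fmt.Printf("in=%d bytes, out=%d bytes\n", len(payload), buf.Len())
}
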
+
+var useSSE42 bool
+
+type compressionLevel struct {
+ good, lazy, nice, chain, fastSkipHashing, level int
+}
+
+// Compression levels have been rebalanced from zlib deflate defaults
+// to give a bigger spread in speed and compression.
+// See https://blog.klauspost.com/rebalancing-deflate-compression-levels/
+var levels = []compressionLevel{
+ {}, // 0
+	// Levels 1-4 use a specialized algorithm - values not used
+ {0, 0, 0, 0, 0, 1},
+ {0, 0, 0, 0, 0, 2},
+ {0, 0, 0, 0, 0, 3},
+ {0, 0, 0, 0, 0, 4},
+ // For levels 5-6 we don't bother trying with lazy matches.
+	// Lazy matching is at least 30% slower, with only about a 1.5% compression increase.
+ {6, 0, 12, 8, 12, 5},
+ {8, 0, 24, 16, 16, 6},
+ // Levels 7-9 use increasingly more lazy matching
+ // and increasingly stringent conditions for "good enough".
+ {8, 8, 24, 16, skipNever, 7},
+ {10, 16, 24, 64, skipNever, 8},
+ {32, 258, 258, 4096, skipNever, 9},
+}
+
+type compressor struct {
+ compressionLevel
+
+ w *huffmanBitWriter
+ bulkHasher func([]byte, []uint32)
+
+ // compression algorithm
+ fill func(*compressor, []byte) int // copy data to window
+ step func(*compressor) // process window
+ sync bool // requesting flush
+
+ // Input hash chains
+ // hashHead[hashValue] contains the largest inputIndex with the specified hash value
+ // If hashHead[hashValue] is within the current window, then
+ // hashPrev[hashHead[hashValue] & windowMask] contains the previous index
+ // with the same hash value.
+ chainHead int
+ hashHead [hashSize]uint32
+ hashPrev [windowSize]uint32
+ hashOffset int
+
+ // input window: unprocessed data is window[index:windowEnd]
+ index int
+ window []byte
+ windowEnd int
+ blockStart int // window index where current tokens start
+ byteAvailable bool // if true, still need to process window[index-1].
+
+ // queued output tokens
+ tokens tokens
+
+ // deflate state
+ length int
+ offset int
+ hash uint32
+ maxInsertIndex int
+ err error
+ ii uint16 // position of last match, intended to overflow to reset.
+
+ snap snappyEnc
+ hashMatch [maxMatchLength + minMatchLength]uint32
+}
+
+func (d *compressor) fillDeflate(b []byte) int {
+ if d.index >= 2*windowSize-(minMatchLength+maxMatchLength) {
+ // shift the window by windowSize
+ copy(d.window[:], d.window[windowSize:2*windowSize])
+ d.index -= windowSize
+ d.windowEnd -= windowSize
+ if d.blockStart >= windowSize {
+ d.blockStart -= windowSize
+ } else {
+ d.blockStart = math.MaxInt32
+ }
+ d.hashOffset += windowSize
+ if d.hashOffset > maxHashOffset {
+ delta := d.hashOffset - 1
+ d.hashOffset -= delta
+ d.chainHead -= delta
+ for i, v := range d.hashPrev {
+ if int(v) > delta {
+ d.hashPrev[i] = uint32(int(v) - delta)
+ } else {
+ d.hashPrev[i] = 0
+ }
+ }
+ for i, v := range d.hashHead {
+ if int(v) > delta {
+ d.hashHead[i] = uint32(int(v) - delta)
+ } else {
+ d.hashHead[i] = 0
+ }
+ }
+ }
+ }
+ n := copy(d.window[d.windowEnd:], b)
+ d.windowEnd += n
+ return n
+}
+
+func (d *compressor) writeBlock(tok tokens, index int, eof bool) error {
+ if index > 0 || eof {
+ var window []byte
+ if d.blockStart <= index {
+ window = d.window[d.blockStart:index]
+ }
+ d.blockStart = index
+ d.w.writeBlock(tok.tokens[:tok.n], eof, window)
+ return d.w.err
+ }
+ return nil
+}
+
+// writeBlockSkip writes the current block and uses the number of tokens
+// to determine whether the block should be stored (when there are no matches),
+// or only Huffman encoded.
+func (d *compressor) writeBlockSkip(tok tokens, index int, eof bool) error {
+ if index > 0 || eof {
+ if d.blockStart <= index {
+ window := d.window[d.blockStart:index]
+ // If we removed less than a 64th of all literals
+ // we huffman compress the block.
+ if int(tok.n) > len(window)-int(tok.n>>6) {
+ d.w.writeBlockHuff(eof, window)
+ } else {
+ // Write a dynamic huffman block.
+ d.w.writeBlockDynamic(tok.tokens[:tok.n], eof, window)
+ }
+ } else {
+ d.w.writeBlock(tok.tokens[:tok.n], eof, nil)
+ }
+ d.blockStart = index
+ return d.w.err
+ }
+ return nil
+}
+
+// fillWindow will fill the current window with the supplied
+// dictionary and calculate all hashes.
+// This is much faster than doing a full encode.
+// Should only be used after a start/reset.
+func (d *compressor) fillWindow(b []byte) {
+	// Do not fill the window if we are in store-only mode,
+	// or if we use constant (Huffman-only) or Snappy compression.
+ switch d.compressionLevel.level {
+ case 0, 1, 2:
+ return
+ }
+ // If we are given too much, cut it.
+ if len(b) > windowSize {
+ b = b[len(b)-windowSize:]
+ }
+ // Add all to window.
+ n := copy(d.window[d.windowEnd:], b)
+
+	// Calculate 256 hashes at a time (more L1 cache hits)
+ loops := (n + 256 - minMatchLength) / 256
+ for j := 0; j < loops; j++ {
+ startindex := j * 256
+ end := startindex + 256 + minMatchLength - 1
+ if end > n {
+ end = n
+ }
+ tocheck := d.window[startindex:end]
+ dstSize := len(tocheck) - minMatchLength + 1
+
+ if dstSize <= 0 {
+ continue
+ }
+
+ dst := d.hashMatch[:dstSize]
+ d.bulkHasher(tocheck, dst)
+ var newH uint32
+ for i, val := range dst {
+ di := i + startindex
+ newH = val & hashMask
+ // Get previous value with the same hash.
+ // Our chain should point to the previous value.
+ d.hashPrev[di&windowMask] = d.hashHead[newH]
+ // Set the head of the hash chain to us.
+ d.hashHead[newH] = uint32(di + d.hashOffset)
+ }
+ d.hash = newH
+ }
+ // Update window information.
+ d.windowEnd += n
+ d.index = n
+}
+
+// Try to find a match starting at pos whose length is greater than prevLength.
+// We only look at d.chain possibilities before giving up.
+// pos = d.index, prevHead = d.chainHead-d.hashOffset, prevLength=minMatchLength-1, lookahead
+func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
+ minMatchLook := maxMatchLength
+ if lookahead < minMatchLook {
+ minMatchLook = lookahead
+ }
+
+ win := d.window[0 : pos+minMatchLook]
+
+ // We quit when we get a match that's at least nice long
+ nice := len(win) - pos
+ if d.nice < nice {
+ nice = d.nice
+ }
+
+ // If we've got a match that's good enough, only look in 1/4 the chain.
+ tries := d.chain
+ length = prevLength
+ if length >= d.good {
+ tries >>= 2
+ }
+
+ wEnd := win[pos+length]
+ wPos := win[pos:]
+ minIndex := pos - windowSize
+
+ for i := prevHead; tries > 0; tries-- {
+ if wEnd == win[i+length] {
+ n := matchLen(win[i:], wPos, minMatchLook)
+
+ if n > length && (n > minMatchLength || pos-i <= 4096) {
+ length = n
+ offset = pos - i
+ ok = true
+ if n >= nice {
+ // The match is good enough that we don't try to find a better one.
+ break
+ }
+ wEnd = win[pos+n]
+ }
+ }
+ if i == minIndex {
+ // hashPrev[i & windowMask] has already been overwritten, so stop now.
+ break
+ }
+ i = int(d.hashPrev[i&windowMask]) - d.hashOffset
+ if i < minIndex || i < 0 {
+ break
+ }
+ }
+ return
+}
+
+// Try to find a match starting at pos whose length is greater than prevLength.
+// We only look at d.chain possibilities before giving up.
+// pos = d.index, prevHead = d.chainHead-d.hashOffset, prevLength=minMatchLength-1, lookahead
+func (d *compressor) findMatchSSE(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
+ minMatchLook := maxMatchLength
+ if lookahead < minMatchLook {
+ minMatchLook = lookahead
+ }
+
+ win := d.window[0 : pos+minMatchLook]
+
+ // We quit when we get a match that's at least nice long
+ nice := len(win) - pos
+ if d.nice < nice {
+ nice = d.nice
+ }
+
+ // If we've got a match that's good enough, only look in 1/4 the chain.
+ tries := d.chain
+ length = prevLength
+ if length >= d.good {
+ tries >>= 2
+ }
+
+ wEnd := win[pos+length]
+ wPos := win[pos:]
+ minIndex := pos - windowSize
+
+ for i := prevHead; tries > 0; tries-- {
+ if wEnd == win[i+length] {
+ n := matchLenSSE4(win[i:], wPos, minMatchLook)
+
+ if n > length && (n > minMatchLength || pos-i <= 4096) {
+ length = n
+ offset = pos - i
+ ok = true
+ if n >= nice {
+ // The match is good enough that we don't try to find a better one.
+ break
+ }
+ wEnd = win[pos+n]
+ }
+ }
+ if i == minIndex {
+ // hashPrev[i & windowMask] has already been overwritten, so stop now.
+ break
+ }
+ i = int(d.hashPrev[i&windowMask]) - d.hashOffset
+ if i < minIndex || i < 0 {
+ break
+ }
+ }
+ return
+}
+
+func (d *compressor) writeStoredBlock(buf []byte) error {
+ if d.w.writeStoredHeader(len(buf), false); d.w.err != nil {
+ return d.w.err
+ }
+ d.w.writeBytes(buf)
+ return d.w.err
+}
+
+const hashmul = 0x1e35a7bd
+
+// hash4 returns a hash representation of the first 4 bytes
+// of the supplied slice.
+// The caller must ensure that len(b) >= 4.
+func hash4(b []byte) uint32 {
+ return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits)
+}
+
+// bulkHash4 will compute hashes using the same
+// algorithm as hash4
+func bulkHash4(b []byte, dst []uint32) {
+ if len(b) < minMatchLength {
+ return
+ }
+ hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
+ dst[0] = (hb * hashmul) >> (32 - hashBits)
+ end := len(b) - minMatchLength + 1
+ for i := 1; i < end; i++ {
+ hb = (hb << 8) | uint32(b[i+3])
+ dst[i] = (hb * hashmul) >> (32 - hashBits)
+ }
+}
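
To make the rolling update in bulkHash4 concrete, here is a small self-contained check (constants and functions copied only for illustration, not part of the patch) that the shifted hash equals hash4 recomputed at every offset:

package main

import "fmt"

const (
	hashmul  = 0x1e35a7bd
	hashBits = 17
)

func hash4(b []byte) uint32 {
	return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits)
}

func bulkHash4(b []byte, dst []uint32) {
	if len(b) < 4 {
		return
	}
	hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
	dst[0] = (hb * hashmul) >> (32 - hashBits)
	for i := 1; i < len(b)-3; i++ {
		// Rolling update: the oldest byte is shifted out of the top
		// and the next byte is pulled in, so no 4-byte reload is needed.
		hb = (hb << 8) | uint32(b[i+3])
		dst[i] = (hb * hashmul) >> (32 - hashBits)
	}
}

func main() {
	b := []byte("the quick brown fox")
	dst := make([]uint32, len(b)-3)
	bulkHash4(b, dst)
	for i := range dst {
		fmt.Println(dst[i] == hash4(b[i:i+4])) // all true
	}
}
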
+
+// matchLen returns the number of matching bytes in a and b
+// up to length 'max'. Both slices must be at least 'max'
+// bytes in size.
+func matchLen(a, b []byte, max int) int {
+ a = a[:max]
+ b = b[:len(a)]
+ for i, av := range a {
+ if b[i] != av {
+ return i
+ }
+ }
+ return max
+}
+
+func (d *compressor) initDeflate() {
+ d.window = make([]byte, 2*windowSize)
+ d.hashOffset = 1
+ d.length = minMatchLength - 1
+ d.offset = 0
+ d.byteAvailable = false
+ d.index = 0
+ d.hash = 0
+ d.chainHead = -1
+ d.bulkHasher = bulkHash4
+ if useSSE42 {
+ d.bulkHasher = crc32sseAll
+ }
+}
+
+// Assumes that d.fastSkipHashing != skipNever,
+// otherwise use deflateLazy
+func (d *compressor) deflate() {
+
+ // Sanity enables additional runtime tests.
+ // It's intended to be used during development
+ // to supplement the currently ad-hoc unit tests.
+ const sanity = false
+
+ if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
+ return
+ }
+
+ d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
+ if d.index < d.maxInsertIndex {
+ d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+ }
+
+ for {
+ if sanity && d.index > d.windowEnd {
+ panic("index > windowEnd")
+ }
+ lookahead := d.windowEnd - d.index
+ if lookahead < minMatchLength+maxMatchLength {
+ if !d.sync {
+ return
+ }
+ if sanity && d.index > d.windowEnd {
+ panic("index > windowEnd")
+ }
+ if lookahead == 0 {
+ if d.tokens.n > 0 {
+ if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ return
+ }
+ }
+ if d.index < d.maxInsertIndex {
+ // Update the hash
+ d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+ ch := d.hashHead[d.hash&hashMask]
+ d.chainHead = int(ch)
+ d.hashPrev[d.index&windowMask] = ch
+ d.hashHead[d.hash&hashMask] = uint32(d.index + d.hashOffset)
+ }
+ d.length = minMatchLength - 1
+ d.offset = 0
+ minIndex := d.index - windowSize
+ if minIndex < 0 {
+ minIndex = 0
+ }
+
+ if d.chainHead-d.hashOffset >= minIndex && lookahead > minMatchLength-1 {
+ if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
+ d.length = newLength
+ d.offset = newOffset
+ }
+ }
+ if d.length >= minMatchLength {
+ d.ii = 0
+ // There was a match at the previous step, and the current match is
+ // not better. Output the previous match.
+			// "d.length-3" should NOT be "d.length-minMatchLength", since the format always assumes 3
+ d.tokens.tokens[d.tokens.n] = matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize))
+ d.tokens.n++
+ // Insert in the hash table all strings up to the end of the match.
+ // index and index-1 are already inserted. If there is not enough
+ // lookahead, the last two strings are not inserted into the hash
+ // table.
+ if d.length <= d.fastSkipHashing {
+ var newIndex int
+ newIndex = d.index + d.length
+ // Calculate missing hashes
+ end := newIndex
+ if end > d.maxInsertIndex {
+ end = d.maxInsertIndex
+ }
+ end += minMatchLength - 1
+ startindex := d.index + 1
+ if startindex > d.maxInsertIndex {
+ startindex = d.maxInsertIndex
+ }
+ tocheck := d.window[startindex:end]
+ dstSize := len(tocheck) - minMatchLength + 1
+ if dstSize > 0 {
+ dst := d.hashMatch[:dstSize]
+ bulkHash4(tocheck, dst)
+ var newH uint32
+ for i, val := range dst {
+ di := i + startindex
+ newH = val & hashMask
+ // Get previous value with the same hash.
+ // Our chain should point to the previous value.
+ d.hashPrev[di&windowMask] = d.hashHead[newH]
+ // Set the head of the hash chain to us.
+ d.hashHead[newH] = uint32(di + d.hashOffset)
+ }
+ d.hash = newH
+ }
+ d.index = newIndex
+ } else {
+ // For matches this long, we don't bother inserting each individual
+ // item into the table.
+ d.index += d.length
+ if d.index < d.maxInsertIndex {
+ d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+ }
+ }
+ if d.tokens.n == maxFlateBlockTokens {
+ // The block includes the current character
+ if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ } else {
+ d.ii++
+ end := d.index + int(d.ii>>uint(d.fastSkipHashing)) + 1
+ if end > d.windowEnd {
+ end = d.windowEnd
+ }
+ for i := d.index; i < end; i++ {
+ d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i]))
+ d.tokens.n++
+ if d.tokens.n == maxFlateBlockTokens {
+ if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ }
+ d.index = end
+ }
+ }
+}
+
+// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever,
+// meaning it always has lazy matching on.
+func (d *compressor) deflateLazy() {
+ // Sanity enables additional runtime tests.
+ // It's intended to be used during development
+ // to supplement the currently ad-hoc unit tests.
+ const sanity = false
+
+ if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
+ return
+ }
+
+ d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
+ if d.index < d.maxInsertIndex {
+ d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+ }
+
+ for {
+ if sanity && d.index > d.windowEnd {
+ panic("index > windowEnd")
+ }
+ lookahead := d.windowEnd - d.index
+ if lookahead < minMatchLength+maxMatchLength {
+ if !d.sync {
+ return
+ }
+ if sanity && d.index > d.windowEnd {
+ panic("index > windowEnd")
+ }
+ if lookahead == 0 {
+ // Flush current output block if any.
+ if d.byteAvailable {
+ // There is still one pending token that needs to be flushed
+ d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+ d.tokens.n++
+ d.byteAvailable = false
+ }
+ if d.tokens.n > 0 {
+ if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ return
+ }
+ }
+ if d.index < d.maxInsertIndex {
+ // Update the hash
+ d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+ ch := d.hashHead[d.hash&hashMask]
+ d.chainHead = int(ch)
+ d.hashPrev[d.index&windowMask] = ch
+ d.hashHead[d.hash&hashMask] = uint32(d.index + d.hashOffset)
+ }
+ prevLength := d.length
+ prevOffset := d.offset
+ d.length = minMatchLength - 1
+ d.offset = 0
+ minIndex := d.index - windowSize
+ if minIndex < 0 {
+ minIndex = 0
+ }
+
+ if d.chainHead-d.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
+ if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
+ d.length = newLength
+ d.offset = newOffset
+ }
+ }
+ if prevLength >= minMatchLength && d.length <= prevLength {
+ // There was a match at the previous step, and the current match is
+ // not better. Output the previous match.
+ d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
+ d.tokens.n++
+
+ // Insert in the hash table all strings up to the end of the match.
+ // index and index-1 are already inserted. If there is not enough
+ // lookahead, the last two strings are not inserted into the hash
+ // table.
+ var newIndex int
+ newIndex = d.index + prevLength - 1
+ // Calculate missing hashes
+ end := newIndex
+ if end > d.maxInsertIndex {
+ end = d.maxInsertIndex
+ }
+ end += minMatchLength - 1
+ startindex := d.index + 1
+ if startindex > d.maxInsertIndex {
+ startindex = d.maxInsertIndex
+ }
+ tocheck := d.window[startindex:end]
+ dstSize := len(tocheck) - minMatchLength + 1
+ if dstSize > 0 {
+ dst := d.hashMatch[:dstSize]
+ bulkHash4(tocheck, dst)
+ var newH uint32
+ for i, val := range dst {
+ di := i + startindex
+ newH = val & hashMask
+ // Get previous value with the same hash.
+ // Our chain should point to the previous value.
+ d.hashPrev[di&windowMask] = d.hashHead[newH]
+ // Set the head of the hash chain to us.
+ d.hashHead[newH] = uint32(di + d.hashOffset)
+ }
+ d.hash = newH
+ }
+
+ d.index = newIndex
+ d.byteAvailable = false
+ d.length = minMatchLength - 1
+ if d.tokens.n == maxFlateBlockTokens {
+ // The block includes the current character
+ if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ } else {
+ // Reset, if we got a match this run.
+ if d.length >= minMatchLength {
+ d.ii = 0
+ }
+ // We have a byte waiting. Emit it.
+ if d.byteAvailable {
+ d.ii++
+ d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+ d.tokens.n++
+ if d.tokens.n == maxFlateBlockTokens {
+ if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ d.index++
+
+ // If we have a long run of no matches, skip additional bytes
+ // Resets when d.ii overflows after 64KB.
+ if d.ii > 31 {
+ n := int(d.ii >> 5)
+ for j := 0; j < n; j++ {
+ if d.index >= d.windowEnd-1 {
+ break
+ }
+
+ d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+ d.tokens.n++
+ if d.tokens.n == maxFlateBlockTokens {
+ if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ d.index++
+ }
+ // Flush last byte
+ d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+ d.tokens.n++
+ d.byteAvailable = false
+ // d.length = minMatchLength - 1 // not needed, since d.ii is reset above, so it should never be > minMatchLength
+ if d.tokens.n == maxFlateBlockTokens {
+ if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ }
+ } else {
+ d.index++
+ d.byteAvailable = true
+ }
+ }
+ }
+}
+
+// Assumes that d.fastSkipHashing != skipNever,
+// otherwise use deflateLazySSE
+func (d *compressor) deflateSSE() {
+
+ // Sanity enables additional runtime tests.
+ // It's intended to be used during development
+ // to supplement the currently ad-hoc unit tests.
+ const sanity = false
+
+ if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
+ return
+ }
+
+ d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
+ if d.index < d.maxInsertIndex {
+ d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
+ }
+
+ for {
+ if sanity && d.index > d.windowEnd {
+ panic("index > windowEnd")
+ }
+ lookahead := d.windowEnd - d.index
+ if lookahead < minMatchLength+maxMatchLength {
+ if !d.sync {
+ return
+ }
+ if sanity && d.index > d.windowEnd {
+ panic("index > windowEnd")
+ }
+ if lookahead == 0 {
+ if d.tokens.n > 0 {
+ if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ return
+ }
+ }
+ if d.index < d.maxInsertIndex {
+ // Update the hash
+ d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
+ ch := d.hashHead[d.hash]
+ d.chainHead = int(ch)
+ d.hashPrev[d.index&windowMask] = ch
+ d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
+ }
+ d.length = minMatchLength - 1
+ d.offset = 0
+ minIndex := d.index - windowSize
+ if minIndex < 0 {
+ minIndex = 0
+ }
+
+ if d.chainHead-d.hashOffset >= minIndex && lookahead > minMatchLength-1 {
+ if newLength, newOffset, ok := d.findMatchSSE(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
+ d.length = newLength
+ d.offset = newOffset
+ }
+ }
+ if d.length >= minMatchLength {
+ d.ii = 0
+ // There was a match at the previous step, and the current match is
+ // not better. Output the previous match.
+			// "d.length-3" should NOT be "d.length-minMatchLength", since the format always assumes 3
+ d.tokens.tokens[d.tokens.n] = matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize))
+ d.tokens.n++
+ // Insert in the hash table all strings up to the end of the match.
+ // index and index-1 are already inserted. If there is not enough
+ // lookahead, the last two strings are not inserted into the hash
+ // table.
+ if d.length <= d.fastSkipHashing {
+ var newIndex int
+ newIndex = d.index + d.length
+ // Calculate missing hashes
+ end := newIndex
+ if end > d.maxInsertIndex {
+ end = d.maxInsertIndex
+ }
+ end += minMatchLength - 1
+ startindex := d.index + 1
+ if startindex > d.maxInsertIndex {
+ startindex = d.maxInsertIndex
+ }
+ tocheck := d.window[startindex:end]
+ dstSize := len(tocheck) - minMatchLength + 1
+ if dstSize > 0 {
+ dst := d.hashMatch[:dstSize]
+
+ crc32sseAll(tocheck, dst)
+ var newH uint32
+ for i, val := range dst {
+ di := i + startindex
+ newH = val & hashMask
+ // Get previous value with the same hash.
+ // Our chain should point to the previous value.
+ d.hashPrev[di&windowMask] = d.hashHead[newH]
+ // Set the head of the hash chain to us.
+ d.hashHead[newH] = uint32(di + d.hashOffset)
+ }
+ d.hash = newH
+ }
+ d.index = newIndex
+ } else {
+ // For matches this long, we don't bother inserting each individual
+ // item into the table.
+ d.index += d.length
+ if d.index < d.maxInsertIndex {
+ d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
+ }
+ }
+ if d.tokens.n == maxFlateBlockTokens {
+ // The block includes the current character
+ if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ } else {
+ d.ii++
+ end := d.index + int(d.ii>>5) + 1
+ if end > d.windowEnd {
+ end = d.windowEnd
+ }
+ for i := d.index; i < end; i++ {
+ d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i]))
+ d.tokens.n++
+ if d.tokens.n == maxFlateBlockTokens {
+ if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ }
+ d.index = end
+ }
+ }
+}
+
+// deflateLazySSE is the same as deflateSSE, but with d.fastSkipHashing == skipNever,
+// meaning it always has lazy matching on.
+func (d *compressor) deflateLazySSE() {
+ // Sanity enables additional runtime tests.
+ // It's intended to be used during development
+ // to supplement the currently ad-hoc unit tests.
+ const sanity = false
+
+ if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
+ return
+ }
+
+ d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
+ if d.index < d.maxInsertIndex {
+ d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
+ }
+
+ for {
+ if sanity && d.index > d.windowEnd {
+ panic("index > windowEnd")
+ }
+ lookahead := d.windowEnd - d.index
+ if lookahead < minMatchLength+maxMatchLength {
+ if !d.sync {
+ return
+ }
+ if sanity && d.index > d.windowEnd {
+ panic("index > windowEnd")
+ }
+ if lookahead == 0 {
+ // Flush current output block if any.
+ if d.byteAvailable {
+ // There is still one pending token that needs to be flushed
+ d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+ d.tokens.n++
+ d.byteAvailable = false
+ }
+ if d.tokens.n > 0 {
+ if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ return
+ }
+ }
+ if d.index < d.maxInsertIndex {
+ // Update the hash
+ d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
+ ch := d.hashHead[d.hash]
+ d.chainHead = int(ch)
+ d.hashPrev[d.index&windowMask] = ch
+ d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
+ }
+ prevLength := d.length
+ prevOffset := d.offset
+ d.length = minMatchLength - 1
+ d.offset = 0
+ minIndex := d.index - windowSize
+ if minIndex < 0 {
+ minIndex = 0
+ }
+
+ if d.chainHead-d.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
+ if newLength, newOffset, ok := d.findMatchSSE(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
+ d.length = newLength
+ d.offset = newOffset
+ }
+ }
+ if prevLength >= minMatchLength && d.length <= prevLength {
+ // There was a match at the previous step, and the current match is
+ // not better. Output the previous match.
+ d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
+ d.tokens.n++
+
+ // Insert in the hash table all strings up to the end of the match.
+ // index and index-1 are already inserted. If there is not enough
+ // lookahead, the last two strings are not inserted into the hash
+ // table.
+ var newIndex int
+ newIndex = d.index + prevLength - 1
+ // Calculate missing hashes
+ end := newIndex
+ if end > d.maxInsertIndex {
+ end = d.maxInsertIndex
+ }
+ end += minMatchLength - 1
+ startindex := d.index + 1
+ if startindex > d.maxInsertIndex {
+ startindex = d.maxInsertIndex
+ }
+ tocheck := d.window[startindex:end]
+ dstSize := len(tocheck) - minMatchLength + 1
+ if dstSize > 0 {
+ dst := d.hashMatch[:dstSize]
+ crc32sseAll(tocheck, dst)
+ var newH uint32
+ for i, val := range dst {
+ di := i + startindex
+ newH = val & hashMask
+ // Get previous value with the same hash.
+ // Our chain should point to the previous value.
+ d.hashPrev[di&windowMask] = d.hashHead[newH]
+ // Set the head of the hash chain to us.
+ d.hashHead[newH] = uint32(di + d.hashOffset)
+ }
+ d.hash = newH
+ }
+
+ d.index = newIndex
+ d.byteAvailable = false
+ d.length = minMatchLength - 1
+ if d.tokens.n == maxFlateBlockTokens {
+ // The block includes the current character
+ if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ } else {
+ // Reset, if we got a match this run.
+ if d.length >= minMatchLength {
+ d.ii = 0
+ }
+ // We have a byte waiting. Emit it.
+ if d.byteAvailable {
+ d.ii++
+ d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+ d.tokens.n++
+ if d.tokens.n == maxFlateBlockTokens {
+ if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ d.index++
+
+ // If we have a long run of no matches, skip additional bytes
+ // Resets when d.ii overflows after 64KB.
+ if d.ii > 31 {
+ n := int(d.ii >> 6)
+ for j := 0; j < n; j++ {
+ if d.index >= d.windowEnd-1 {
+ break
+ }
+
+ d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+ d.tokens.n++
+ if d.tokens.n == maxFlateBlockTokens {
+ if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ d.index++
+ }
+ // Flush last byte
+ d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
+ d.tokens.n++
+ d.byteAvailable = false
+ // d.length = minMatchLength - 1 // not needed, since d.ii is reset above, so it should never be > minMatchLength
+ if d.tokens.n == maxFlateBlockTokens {
+ if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+ return
+ }
+ d.tokens.n = 0
+ }
+ }
+ } else {
+ d.index++
+ d.byteAvailable = true
+ }
+ }
+ }
+}
+
+func (d *compressor) store() {
+ if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) {
+ d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+ d.windowEnd = 0
+ }
+}
+
+// fillBlock will fill the buffer with data for huffman-only and store-only
+// compression (also used by the Snappy-based levels 1-4).
+// The number of bytes copied is returned.
+func (d *compressor) fillBlock(b []byte) int {
+ n := copy(d.window[d.windowEnd:], b)
+ d.windowEnd += n
+ return n
+}
+
+// storeHuff will compress and store the currently added data
+// if enough has been accumulated or we are at the end of the stream.
+// Any error that occurred will be in d.err.
+func (d *compressor) storeHuff() {
+ if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 {
+ return
+ }
+ d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+ d.err = d.w.err
+ d.windowEnd = 0
+}
+
+// storeSnappy will compress and store the currently added data
+// if enough has been accumulated or we are at the end of the stream.
+// Any error that occurred will be in d.err.
+func (d *compressor) storeSnappy() {
+ // We only compress if we have maxStoreBlockSize.
+ if d.windowEnd < maxStoreBlockSize {
+ if !d.sync {
+ return
+ }
+ // Handle extremely small sizes.
+ if d.windowEnd < 128 {
+ if d.windowEnd == 0 {
+ return
+ }
+ if d.windowEnd <= 32 {
+ d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+ d.tokens.n = 0
+ d.windowEnd = 0
+ } else {
+ d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+ d.err = d.w.err
+ }
+ d.tokens.n = 0
+ d.windowEnd = 0
+ d.snap.Reset()
+ return
+ }
+ }
+
+ d.snap.Encode(&d.tokens, d.window[:d.windowEnd])
+ // If we made zero matches, store the block as is.
+ if int(d.tokens.n) == d.windowEnd {
+ d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+ // If we removed less than 1/16th, huffman compress the block.
+ } else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) {
+ d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+ d.err = d.w.err
+ } else {
+ d.w.writeBlockDynamic(d.tokens.tokens[:d.tokens.n], false, d.window[:d.windowEnd])
+ d.err = d.w.err
+ }
+ d.tokens.n = 0
+ d.windowEnd = 0
+}
+
+// write will add input bytes to the stream.
+// Unless an error occurs all bytes will be consumed.
+func (d *compressor) write(b []byte) (n int, err error) {
+ if d.err != nil {
+ return 0, d.err
+ }
+ n = len(b)
+ for len(b) > 0 {
+ d.step(d)
+ b = b[d.fill(d, b):]
+ if d.err != nil {
+ return 0, d.err
+ }
+ }
+ return n, d.err
+}
+
+func (d *compressor) syncFlush() error {
+ d.sync = true
+ if d.err != nil {
+ return d.err
+ }
+ d.step(d)
+ if d.err == nil {
+ d.w.writeStoredHeader(0, false)
+ d.w.flush()
+ d.err = d.w.err
+ }
+ d.sync = false
+ return d.err
+}
+
+func (d *compressor) init(w io.Writer, level int) (err error) {
+ d.w = newHuffmanBitWriter(w)
+
+ switch {
+ case level == NoCompression:
+ d.window = make([]byte, maxStoreBlockSize)
+ d.fill = (*compressor).fillBlock
+ d.step = (*compressor).store
+ case level == ConstantCompression:
+ d.window = make([]byte, maxStoreBlockSize)
+ d.fill = (*compressor).fillBlock
+ d.step = (*compressor).storeHuff
+ case level >= 1 && level <= 4:
+ d.snap = newSnappy(level)
+ d.window = make([]byte, maxStoreBlockSize)
+ d.fill = (*compressor).fillBlock
+ d.step = (*compressor).storeSnappy
+ case level == DefaultCompression:
+ level = 5
+ fallthrough
+ case 5 <= level && level <= 9:
+ d.compressionLevel = levels[level]
+ d.initDeflate()
+ d.fill = (*compressor).fillDeflate
+ if d.fastSkipHashing == skipNever {
+ if useSSE42 {
+ d.step = (*compressor).deflateLazySSE
+ } else {
+ d.step = (*compressor).deflateLazy
+ }
+ } else {
+ if useSSE42 {
+ d.step = (*compressor).deflateSSE
+ } else {
+ d.step = (*compressor).deflate
+
+ }
+ }
+ default:
+ return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level)
+ }
+ return nil
+}
+
+// reset the state of the compressor.
+func (d *compressor) reset(w io.Writer) {
+ d.w.reset(w)
+ d.sync = false
+ d.err = nil
+ // We only need to reset a few things for Snappy.
+ if d.snap != nil {
+ d.snap.Reset()
+ d.windowEnd = 0
+ d.tokens.n = 0
+ return
+ }
+ switch d.compressionLevel.chain {
+ case 0:
+		// level was NoCompression or ConstantCompression.
+ d.windowEnd = 0
+ default:
+ d.chainHead = -1
+ for i := range d.hashHead {
+ d.hashHead[i] = 0
+ }
+ for i := range d.hashPrev {
+ d.hashPrev[i] = 0
+ }
+ d.hashOffset = 1
+ d.index, d.windowEnd = 0, 0
+ d.blockStart, d.byteAvailable = 0, false
+ d.tokens.n = 0
+ d.length = minMatchLength - 1
+ d.offset = 0
+ d.hash = 0
+ d.ii = 0
+ d.maxInsertIndex = 0
+ }
+}
+
+func (d *compressor) close() error {
+ if d.err != nil {
+ return d.err
+ }
+ d.sync = true
+ d.step(d)
+ if d.err != nil {
+ return d.err
+ }
+ if d.w.writeStoredHeader(0, true); d.w.err != nil {
+ return d.w.err
+ }
+ d.w.flush()
+ return d.w.err
+}
+
+// NewWriter returns a new Writer compressing data at the given level.
+// Following zlib, levels range from 1 (BestSpeed) to 9 (BestCompression);
+// higher levels typically run slower but compress more.
+// Level 0 (NoCompression) does not attempt any compression; it only adds the
+// necessary DEFLATE framing.
+// Level -1 (DefaultCompression) uses the default compression level.
+// Level -2 (ConstantCompression) will use Huffman compression only, giving
+// a very fast compression for all types of input, but sacrificing considerable
+// compression efficiency.
+//
+// If level is in the range [-2, 9] then the error returned will be nil.
+// Otherwise the error returned will be non-nil.
+func NewWriter(w io.Writer, level int) (*Writer, error) {
+ var dw Writer
+ if err := dw.d.init(w, level); err != nil {
+ return nil, err
+ }
+ return &dw, nil
+}
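
A short round-trip sketch of the Writer API documented above, assuming the package's NewReader from inflate.go (the sample text is invented):

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"log"

	"github.com/klauspost/compress/flate"
)

func main() {
	var buf bytes.Buffer

	w, err := flate.NewWriter(&buf, flate.DefaultCompression)
	if err != nil {
		log.Fatal(err) // only possible for levels outside [-2, 9]
	}
	if _, err := w.Write([]byte("hello, deflate")); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}

	r := flate.NewReader(&buf)
	defer r.Close()
	out, err := ioutil.ReadAll(r)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(out)) // hello, deflate
}
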
+
+// NewWriterDict is like NewWriter but initializes the new
+// Writer with a preset dictionary. The returned Writer behaves
+// as if the dictionary had been written to it without producing
+// any compressed output. The compressed data written to w
+// can only be decompressed by a Reader initialized with the
+// same dictionary.
+func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
+ dw := &dictWriter{w}
+ zw, err := NewWriter(dw, level)
+ if err != nil {
+ return nil, err
+ }
+ zw.d.fillWindow(dict)
+ zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method.
+ return zw, err
+}
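
A minimal sketch of the preset-dictionary flow described above, assuming the matching NewReaderDict in this package's inflate.go (the strings are invented):

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"log"

	"github.com/klauspost/compress/flate"
)

func main() {
	dict := []byte("common prefix shared by writer and reader")
	var buf bytes.Buffer

	// The writer behaves as if dict had already been written to it.
	w, err := flate.NewWriterDict(&buf, flate.BestCompression, dict)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := w.Write([]byte("common prefix shared by writer and reader, plus new data")); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}

	// The reader must be primed with the same dictionary.
	r := flate.NewReaderDict(&buf, dict)
	defer r.Close()
	out, err := ioutil.ReadAll(r)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(out))
}
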
+
+type dictWriter struct {
+ w io.Writer
+}
+
+func (w *dictWriter) Write(b []byte) (n int, err error) {
+ return w.w.Write(b)
+}
+
+// A Writer takes data written to it and writes the compressed
+// form of that data to an underlying writer (see NewWriter).
+type Writer struct {
+ d compressor
+ dict []byte
+}
+
+// Write writes data to w, which will eventually write the
+// compressed form of data to its underlying writer.
+func (w *Writer) Write(data []byte) (n int, err error) {
+ return w.d.write(data)
+}
+
+// Flush flushes any pending data to the underlying writer.
+// It is useful mainly in compressed network protocols, to ensure that
+// a remote reader has enough data to reconstruct a packet.
+// Flush does not return until the data has been written.
+// Calling Flush when there is no pending data still causes the Writer
+// to emit a sync marker of at least 4 bytes.
+// If the underlying writer returns an error, Flush returns that error.
+//
+// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH.
+func (w *Writer) Flush() error {
+ // For more about flushing:
+ // http://www.bolet.org/~pornin/deflate-flush.html
+ return w.d.syncFlush()
+}
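
To illustrate the sync-flush semantics documented above, a sketch (not part of the patch) in which the receiving side can decode everything written so far as soon as Flush returns, before Close is ever called:

package main

import (
	"bytes"
	"fmt"
	"io"
	"log"

	"github.com/klauspost/compress/flate"
)

func main() {
	var conn bytes.Buffer // stands in for a network connection

	w, err := flate.NewWriter(&conn, flate.BestSpeed)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := w.Write([]byte("packet-1")); err != nil {
		log.Fatal(err)
	}
	// Z_SYNC_FLUSH equivalent: emit the pending block plus a sync marker
	// so the peer can reconstruct "packet-1" right now.
	if err := w.Flush(); err != nil {
		log.Fatal(err)
	}

	r := flate.NewReader(&conn)
	buf := make([]byte, 8)
	if _, err := io.ReadFull(r, buf); err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(buf)) // packet-1
}
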
+
+// Close flushes and closes the writer.
+func (w *Writer) Close() error {
+ return w.d.close()
+}
+
+// Reset discards the writer's state and makes it equivalent to
+// the result of NewWriter or NewWriterDict called with dst
+// and w's level and dictionary.
+func (w *Writer) Reset(dst io.Writer) {
+ if dw, ok := w.d.w.writer.(*dictWriter); ok {
+ // w was created with NewWriterDict
+ dw.w = dst
+ w.d.reset(dw)
+ w.d.fillWindow(w.dict)
+ } else {
+ // w was created with NewWriter
+ w.d.reset(dst)
+ }
+}
+
+// ResetDict discards the writer's state and makes it equivalent to
+// the result of NewWriter or NewWriterDict called with dst
+// and w's level, but sets a specific dictionary.
+func (w *Writer) ResetDict(dst io.Writer, dict []byte) {
+ w.dict = dict
+ w.d.reset(dst)
+ w.d.fillWindow(w.dict)
+}
diff --git a/vendor/github.com/klauspost/compress/flate/dict_decoder.go b/vendor/github.com/klauspost/compress/flate/dict_decoder.go
new file mode 100644
index 0000000000..71c75a065e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/dict_decoder.go
@@ -0,0 +1,184 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// dictDecoder implements the LZ77 sliding dictionary as used in decompression.
+// LZ77 decompresses data through sequences of two forms of commands:
+//
+// * Literal insertions: Runs of one or more symbols are inserted into the data
+// stream as is. This is accomplished through the writeByte method for a
+// single symbol, or combinations of writeSlice/writeMark for multiple symbols.
+// Any valid stream must start with a literal insertion if no preset dictionary
+// is used.
+//
+// * Backward copies: Runs of one or more symbols are copied from previously
+// emitted data. Backward copies come as the tuple (dist, length) where dist
+// determines how far back in the stream to copy from and length determines how
+// many bytes to copy. Note that it is valid for the length to be greater than
+// the distance. Since LZ77 uses forward copies, that situation is used to
+// perform a form of run-length encoding on repeated runs of symbols.
+// The writeCopy and tryWriteCopy methods are used to implement this command.
+//
+// For performance reasons, this implementation performs little to no sanity
+// checks about the arguments. As such, the invariants documented for each
+// method call must be respected.
+type dictDecoder struct {
+ hist []byte // Sliding window history
+
+ // Invariant: 0 <= rdPos <= wrPos <= len(hist)
+ wrPos int // Current output position in buffer
+ rdPos int // Have emitted hist[:rdPos] already
+ full bool // Has a full window length been written yet?
+}
+
+// init initializes dictDecoder to have a sliding window dictionary of the given
+// size. If a preset dict is provided, it will initialize the dictionary with
+// the contents of dict.
+func (dd *dictDecoder) init(size int, dict []byte) {
+ *dd = dictDecoder{hist: dd.hist}
+
+ if cap(dd.hist) < size {
+ dd.hist = make([]byte, size)
+ }
+ dd.hist = dd.hist[:size]
+
+ if len(dict) > len(dd.hist) {
+ dict = dict[len(dict)-len(dd.hist):]
+ }
+ dd.wrPos = copy(dd.hist, dict)
+ if dd.wrPos == len(dd.hist) {
+ dd.wrPos = 0
+ dd.full = true
+ }
+ dd.rdPos = dd.wrPos
+}
+
+// histSize reports the total amount of historical data in the dictionary.
+func (dd *dictDecoder) histSize() int {
+ if dd.full {
+ return len(dd.hist)
+ }
+ return dd.wrPos
+}
+
+// availRead reports the number of bytes that can be flushed by readFlush.
+func (dd *dictDecoder) availRead() int {
+ return dd.wrPos - dd.rdPos
+}
+
+// availWrite reports the available amount of output buffer space.
+func (dd *dictDecoder) availWrite() int {
+ return len(dd.hist) - dd.wrPos
+}
+
+// writeSlice returns a slice of the available buffer to write data to.
+//
+// This invariant will be kept: len(s) <= availWrite()
+func (dd *dictDecoder) writeSlice() []byte {
+ return dd.hist[dd.wrPos:]
+}
+
+// writeMark advances the writer pointer by cnt.
+//
+// This invariant must be kept: 0 <= cnt <= availWrite()
+func (dd *dictDecoder) writeMark(cnt int) {
+ dd.wrPos += cnt
+}
+
+// writeByte writes a single byte to the dictionary.
+//
+// This invariant must be kept: 0 < availWrite()
+func (dd *dictDecoder) writeByte(c byte) {
+ dd.hist[dd.wrPos] = c
+ dd.wrPos++
+}
+
+// writeCopy copies a string at a given (dist, length) to the output.
+// This returns the number of bytes copied and may be less than the requested
+// length if the available space in the output buffer is too small.
+//
+// This invariant must be kept: 0 < dist <= histSize()
+func (dd *dictDecoder) writeCopy(dist, length int) int {
+ dstBase := dd.wrPos
+ dstPos := dstBase
+ srcPos := dstPos - dist
+ endPos := dstPos + length
+ if endPos > len(dd.hist) {
+ endPos = len(dd.hist)
+ }
+
+ // Copy non-overlapping section after destination position.
+ //
+ // This section is non-overlapping in that the copy length for this section
+ // is always less than or equal to the backwards distance. This can occur
+	// if a distance refers to data that wraps around in the buffer.
+	// Thus, a backwards copy is performed here; that is, the exact bytes in
+	// the source prior to the copy are placed in the destination.
+ if srcPos < 0 {
+ srcPos += len(dd.hist)
+ dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:])
+ srcPos = 0
+ }
+
+ // Copy possibly overlapping section before destination position.
+ //
+ // This section can overlap if the copy length for this section is larger
+ // than the backwards distance. This is allowed by LZ77 so that repeated
+ // strings can be succinctly represented using (dist, length) pairs.
+	// Thus, a forwards copy is performed here; that is, the bytes copied are
+ // possibly dependent on the resulting bytes in the destination as the copy
+ // progresses along. This is functionally equivalent to the following:
+ //
+ // for i := 0; i < endPos-dstPos; i++ {
+ // dd.hist[dstPos+i] = dd.hist[srcPos+i]
+ // }
+ // dstPos = endPos
+ //
+ for dstPos < endPos {
+ dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos])
+ }
+
+ dd.wrPos = dstPos
+ return dstPos - dstBase
+}
+
+// tryWriteCopy tries to copy a string at a given (distance, length) to the
+// output. This specialized version is optimized for short distances.
+//
+// This method is designed to be inlined for performance reasons.
+//
+// This invariant must be kept: 0 < dist <= histSize()
+func (dd *dictDecoder) tryWriteCopy(dist, length int) int {
+ dstPos := dd.wrPos
+ endPos := dstPos + length
+ if dstPos < dist || endPos > len(dd.hist) {
+ return 0
+ }
+ dstBase := dstPos
+ srcPos := dstPos - dist
+
+ // Copy possibly overlapping section before destination position.
+loop:
+ dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos])
+ if dstPos < endPos {
+ goto loop // Avoid for-loop so that this function can be inlined
+ }
+
+ dd.wrPos = dstPos
+ return dstPos - dstBase
+}
+
+// readFlush returns a slice of the historical buffer that is ready to be
+// emitted to the user. The data returned by readFlush must be fully consumed
+// before calling any other dictDecoder methods.
+func (dd *dictDecoder) readFlush() []byte {
+ toRead := dd.hist[dd.rdPos:dd.wrPos]
+ dd.rdPos = dd.wrPos
+ if dd.wrPos == len(dd.hist) {
+ dd.wrPos, dd.rdPos = 0, 0
+ dd.full = true
+ }
+ return toRead
+}
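
The package comment above notes that a backward copy may be longer than its distance, which run-length encodes a repeated pattern via forward copying. A standalone sketch of that expansion (illustrative only, mirroring the writeCopy loop):

package main

import "fmt"

// expandCopy appends a backward copy of (dist, length) to hist, allowing
// length > dist exactly as the LZ77 format does.
func expandCopy(hist []byte, dist, length int) []byte {
	start := len(hist) - dist
	for i := 0; i < length; i++ {
		// Each appended byte may itself be one that an earlier iteration
		// just produced, which is what replicates the pattern.
		hist = append(hist, hist[start+i])
	}
	return hist
}

func main() {
	hist := []byte("ab")
	// dist=2, length=8: the 2-byte pattern is copied forward four times,
	// a compact run-length encoding of a longer repeated string.
	hist = expandCopy(hist, 2, 8)
	fmt.Println(string(hist)) // ababababab
}
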
diff --git a/vendor/github.com/klauspost/compress/flate/gen.go b/vendor/github.com/klauspost/compress/flate/gen.go
new file mode 100644
index 0000000000..154c89a488
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/gen.go
@@ -0,0 +1,265 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// This program generates fixedhuff.go
+// Invoke as
+//
+// go run gen.go -output fixedhuff.go
+
+package main
+
+import (
+ "bytes"
+ "flag"
+ "fmt"
+ "go/format"
+ "io/ioutil"
+ "log"
+)
+
+var filename = flag.String("output", "fixedhuff.go", "output file name")
+
+const maxCodeLen = 16
+
+// Note: the definition of the huffmanDecoder struct is copied from
+// inflate.go, as it is private to the implementation.
+
+// chunk & 15 is number of bits
+// chunk >> 4 is value, including table link
+
+const (
+ huffmanChunkBits = 9
+ huffmanNumChunks = 1 << huffmanChunkBits
+ huffmanCountMask = 15
+ huffmanValueShift = 4
+)
+
+type huffmanDecoder struct {
+ min int // the minimum code length
+ chunks [huffmanNumChunks]uint32 // chunks as described above
+ links [][]uint32 // overflow links
+ linkMask uint32 // mask the width of the link table
+}
+
+// Initialize Huffman decoding tables from array of code lengths.
+// Following this function, h is guaranteed to be initialized into a complete
+// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
+// degenerate case where the tree has only a single symbol with length 1. Empty
+// trees are permitted.
+func (h *huffmanDecoder) init(bits []int) bool {
+ // Sanity enables additional runtime tests during Huffman
+ // table construction. It's intended to be used during
+ // development to supplement the currently ad-hoc unit tests.
+ const sanity = false
+
+ if h.min != 0 {
+ *h = huffmanDecoder{}
+ }
+
+ // Count number of codes of each length,
+ // compute min and max length.
+ var count [maxCodeLen]int
+ var min, max int
+ for _, n := range bits {
+ if n == 0 {
+ continue
+ }
+ if min == 0 || n < min {
+ min = n
+ }
+ if n > max {
+ max = n
+ }
+ count[n]++
+ }
+
+ // Empty tree. The decompressor.huffSym function will fail later if the tree
+ // is used. Technically, an empty tree is only valid for the HDIST tree and
+ // not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
+ // is guaranteed to fail since it will attempt to use the tree to decode the
+ // codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
+ // guaranteed to fail later since the compressed data section must be
+ // composed of at least one symbol (the end-of-block marker).
+ if max == 0 {
+ return true
+ }
+
+ code := 0
+ var nextcode [maxCodeLen]int
+ for i := min; i <= max; i++ {
+ code <<= 1
+ nextcode[i] = code
+ code += count[i]
+ }
+
+ // Check that the coding is complete (i.e., that we've
+ // assigned all 2-to-the-max possible bit sequences).
+ // Exception: To be compatible with zlib, we also need to
+ // accept degenerate single-code codings. See also
+ // TestDegenerateHuffmanCoding.
+ if code != 1<<uint(max) && !(code == 1 && max == 1) {
+ return false
+ }
+
+ h.min = min
+ if max > huffmanChunkBits {
+ numLinks := 1 << (uint(max) - huffmanChunkBits)
+ h.linkMask = uint32(numLinks - 1)
+
+ // create link tables
+ link := nextcode[huffmanChunkBits+1] >> 1
+ h.links = make([][]uint32, huffmanNumChunks-link)
+ for j := uint(link); j < huffmanNumChunks; j++ {
+ reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8
+ reverse >>= uint(16 - huffmanChunkBits)
+ off := j - uint(link)
+ if sanity && h.chunks[reverse] != 0 {
+ panic("impossible: overwriting existing chunk")
+ }
+ h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1))
+ h.links[off] = make([]uint32, numLinks)
+ }
+ }
+
+ for i, n := range bits {
+ if n == 0 {
+ continue
+ }
+ code := nextcode[n]
+ nextcode[n]++
+ chunk := uint32(i<<huffmanValueShift | n)
+ reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8
+ reverse >>= uint(16 - n)
+ if n <= huffmanChunkBits {
+ for off := reverse; off < len(h.chunks); off += 1 << uint(n) {
+ // We should never need to overwrite
+ // an existing chunk. Also, 0 is
+ // never a valid chunk, because the
+ // lower 4 "count" bits should be
+ // between 1 and 15.
+ if sanity && h.chunks[off] != 0 {
+ panic("impossible: overwriting existing chunk")
+ }
+ h.chunks[off] = chunk
+ }
+ } else {
+ j := reverse & (huffmanNumChunks - 1)
+ if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 {
+ // Longer codes should have been
+ // associated with a link table above.
+ panic("impossible: not an indirect chunk")
+ }
+ value := h.chunks[j] >> huffmanValueShift
+ linktab := h.links[value]
+ reverse >>= huffmanChunkBits
+ for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) {
+ if sanity && linktab[off] != 0 {
+ panic("impossible: overwriting existing chunk")
+ }
+ linktab[off] = chunk
+ }
+ }
+ }
+
+ if sanity {
+ // Above we've sanity checked that we never overwrote
+ // an existing entry. Here we additionally check that
+ // we filled the tables completely.
+ for i, chunk := range h.chunks {
+ if chunk == 0 {
+ // As an exception, in the degenerate
+ // single-code case, we allow odd
+ // chunks to be missing.
+ if code == 1 && i%2 == 1 {
+ continue
+ }
+ panic("impossible: missing chunk")
+ }
+ }
+ for _, linktab := range h.links {
+ for _, chunk := range linktab {
+ if chunk == 0 {
+ panic("impossible: missing chunk")
+ }
+ }
+ }
+ }
+
+ return true
+}
+
+func main() {
+ flag.Parse()
+
+ var h huffmanDecoder
+ var bits [288]int
+ initReverseByte()
+ for i := 0; i < 144; i++ {
+ bits[i] = 8
+ }
+ for i := 144; i < 256; i++ {
+ bits[i] = 9
+ }
+ for i := 256; i < 280; i++ {
+ bits[i] = 7
+ }
+ for i := 280; i < 288; i++ {
+ bits[i] = 8
+ }
+ h.init(bits[:])
+ if h.links != nil {
+ log.Fatal("Unexpected links table in fixed Huffman decoder")
+ }
+
+ var buf bytes.Buffer
+
+ fmt.Fprintf(&buf, `// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.`+"\n\n")
+
+ fmt.Fprintln(&buf, "package flate")
+ fmt.Fprintln(&buf)
+ fmt.Fprintln(&buf, "// autogenerated by go run gen.go -output fixedhuff.go, DO NOT EDIT")
+ fmt.Fprintln(&buf)
+ fmt.Fprintln(&buf, "var fixedHuffmanDecoder = huffmanDecoder{")
+ fmt.Fprintf(&buf, "\t%d,\n", h.min)
+ fmt.Fprintln(&buf, "\t[huffmanNumChunks]uint32{")
+ for i := 0; i < huffmanNumChunks; i++ {
+ if i&7 == 0 {
+ fmt.Fprintf(&buf, "\t\t")
+ } else {
+ fmt.Fprintf(&buf, " ")
+ }
+ fmt.Fprintf(&buf, "0x%04x,", h.chunks[i])
+ if i&7 == 7 {
+ fmt.Fprintln(&buf)
+ }
+ }
+ fmt.Fprintln(&buf, "\t},")
+ fmt.Fprintln(&buf, "\tnil, 0,")
+ fmt.Fprintln(&buf, "}")
+
+ data, err := format.Source(buf.Bytes())
+ if err != nil {
+ log.Fatal(err)
+ }
+ err = ioutil.WriteFile(*filename, data, 0644)
+ if err != nil {
+ log.Fatal(err)
+ }
+}
+
+var reverseByte [256]byte
+
+func initReverseByte() {
+ for x := 0; x < 256; x++ {
+ var result byte
+ for i := uint(0); i < 8; i++ {
+ result |= byte(((x >> i) & 1) << (7 - i))
+ }
+ reverseByte[x] = result
+ }
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
new file mode 100644
index 0000000000..f9b2a699a3
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -0,0 +1,701 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+ "io"
+)
+
+const (
+ // The largest offset code.
+ offsetCodeCount = 30
+
+ // The special code used to mark the end of a block.
+ endBlockMarker = 256
+
+ // The first length code.
+ lengthCodesStart = 257
+
+ // The number of codegen codes.
+ codegenCodeCount = 19
+ badCode = 255
+
+ // bufferFlushSize indicates the buffer size
+ // after which bytes are flushed to the writer.
+ // Should preferably be a multiple of 6, since
+ // we accumulate 6 bytes between writes to the buffer.
+ bufferFlushSize = 240
+
+ // bufferSize is the actual output byte buffer size.
+ // It must have additional headroom for a flush
+ // which can contain up to 8 bytes.
+ bufferSize = bufferFlushSize + 8
+)
+
+// The number of extra bits needed by length code X - lengthCodesStart.
+var lengthExtraBits = []int8{
+ /* 257 */ 0, 0, 0,
+ /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
+ /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
+ /* 280 */ 4, 5, 5, 5, 5, 0,
+}
+
+// The length indicated by length code X - lengthCodesStart.
+var lengthBase = []uint32{
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10,
+ 12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
+ 64, 80, 96, 112, 128, 160, 192, 224, 255,
+}
+
+// offset code word extra bits.
+var offsetExtraBits = []int8{
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
+ 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
+ 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ /* extended window */
+ 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
+}
+
+var offsetBase = []uint32{
+ /* normal deflate */
+ 0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
+ 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
+ 0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
+ 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
+ 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
+ 0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
+
+ /* extended window */
+ 0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
+ 0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
+ 0x100000, 0x180000, 0x200000, 0x300000,
+}
+
+// The odd order in which the codegen code sizes are written.
+var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
+
+type huffmanBitWriter struct {
+ // writer is the underlying writer.
+ // Do not use it directly; use the write method, which ensures
+ // that Write errors are sticky.
+ writer io.Writer
+
+ // Data waiting to be written is bytes[0:nbytes]
+ // and then the low nbits of bits.
+ bits uint64
+ nbits uint
+ bytes [bufferSize]byte
+ codegenFreq [codegenCodeCount]int32
+ nbytes int
+ literalFreq []int32
+ offsetFreq []int32
+ codegen []uint8
+ literalEncoding *huffmanEncoder
+ offsetEncoding *huffmanEncoder
+ codegenEncoding *huffmanEncoder
+ err error
+}
+
+func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
+ return &huffmanBitWriter{
+ writer: w,
+ literalFreq: make([]int32, maxNumLit),
+ offsetFreq: make([]int32, offsetCodeCount),
+ codegen: make([]uint8, maxNumLit+offsetCodeCount+1),
+ literalEncoding: newHuffmanEncoder(maxNumLit),
+ codegenEncoding: newHuffmanEncoder(codegenCodeCount),
+ offsetEncoding: newHuffmanEncoder(offsetCodeCount),
+ }
+}
+
+func (w *huffmanBitWriter) reset(writer io.Writer) {
+ w.writer = writer
+ w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil
+ w.bytes = [bufferSize]byte{}
+}
+
+func (w *huffmanBitWriter) flush() {
+ if w.err != nil {
+ w.nbits = 0
+ return
+ }
+ n := w.nbytes
+ for w.nbits != 0 {
+ w.bytes[n] = byte(w.bits)
+ w.bits >>= 8
+ if w.nbits > 8 { // Avoid underflow
+ w.nbits -= 8
+ } else {
+ w.nbits = 0
+ }
+ n++
+ }
+ w.bits = 0
+ w.write(w.bytes[:n])
+ w.nbytes = 0
+}
+
+func (w *huffmanBitWriter) write(b []byte) {
+ if w.err != nil {
+ return
+ }
+ _, w.err = w.writer.Write(b)
+}
+
+func (w *huffmanBitWriter) writeBits(b int32, nb uint) {
+ if w.err != nil {
+ return
+ }
+ w.bits |= uint64(b) << w.nbits
+ w.nbits += nb
+ if w.nbits >= 48 {
+ bits := w.bits
+ w.bits >>= 48
+ w.nbits -= 48
+ n := w.nbytes
+ bytes := w.bytes[n : n+6]
+ bytes[0] = byte(bits)
+ bytes[1] = byte(bits >> 8)
+ bytes[2] = byte(bits >> 16)
+ bytes[3] = byte(bits >> 24)
+ bytes[4] = byte(bits >> 32)
+ bytes[5] = byte(bits >> 40)
+ n += 6
+ if n >= bufferFlushSize {
+ w.write(w.bytes[:n])
+ n = 0
+ }
+ w.nbytes = n
+ }
+}
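+
+// For example, writeBits(0x3, 2) followed by writeBits(0x1, 1) leaves the
+// value 0b111 in the low three bits of the accumulator; values are packed
+// starting at the least significant free bit. Bytes are only materialized
+// once 48 bits have accumulated, so the hot path avoids per-bit writes.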
+
+func (w *huffmanBitWriter) writeBytes(bytes []byte) {
+ if w.err != nil {
+ return
+ }
+ n := w.nbytes
+ if w.nbits&7 != 0 {
+ w.err = InternalError("writeBytes with unfinished bits")
+ return
+ }
+ for w.nbits != 0 {
+ w.bytes[n] = byte(w.bits)
+ w.bits >>= 8
+ w.nbits -= 8
+ n++
+ }
+ if n != 0 {
+ w.write(w.bytes[:n])
+ }
+ w.nbytes = 0
+ w.write(bytes)
+}
+
+// RFC 1951 3.2.7 specifies a special run-length encoding for specifying
+// the literal and offset lengths arrays (which are concatenated into a single
+// array). This method generates that run-length encoding.
+//
+// The result is written into the codegen array, and the frequencies
+// of each code are written into the codegenFreq array.
+// Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
+// information. Code badCode is an end marker.
+//
+// numLiterals    The number of literals in literalEncoding
+// numOffsets     The number of offsets in offsetEncoding
+// litEnc, offEnc The literal and offset encoders to use
+func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) {
+ for i := range w.codegenFreq {
+ w.codegenFreq[i] = 0
+ }
+ // Note that we are using codegen both as a temporary variable for holding
+ // a copy of the frequencies, and as the place where we put the result.
+ // This is fine because the output is always shorter than the input used
+ // so far.
+ codegen := w.codegen // cache
+ // Copy the concatenated code sizes to codegen. Put a marker at the end.
+ cgnl := codegen[:numLiterals]
+ for i := range cgnl {
+ cgnl[i] = uint8(litEnc.codes[i].len)
+ }
+
+ cgnl = codegen[numLiterals : numLiterals+numOffsets]
+ for i := range cgnl {
+ cgnl[i] = uint8(offEnc.codes[i].len)
+ }
+ codegen[numLiterals+numOffsets] = badCode
+
+ size := codegen[0]
+ count := 1
+ outIndex := 0
+ for inIndex := 1; size != badCode; inIndex++ {
+ // INVARIANT: We have seen "count" copies of size that have not yet
+ // had output generated for them.
+ nextSize := codegen[inIndex]
+ if nextSize == size {
+ count++
+ continue
+ }
+ // We need to generate codegen indicating "count" of size.
+ if size != 0 {
+ codegen[outIndex] = size
+ outIndex++
+ w.codegenFreq[size]++
+ count--
+ for count >= 3 {
+ n := 6
+ if n > count {
+ n = count
+ }
+ codegen[outIndex] = 16
+ outIndex++
+ codegen[outIndex] = uint8(n - 3)
+ outIndex++
+ w.codegenFreq[16]++
+ count -= n
+ }
+ } else {
+ for count >= 11 {
+ n := 138
+ if n > count {
+ n = count
+ }
+ codegen[outIndex] = 18
+ outIndex++
+ codegen[outIndex] = uint8(n - 11)
+ outIndex++
+ w.codegenFreq[18]++
+ count -= n
+ }
+ if count >= 3 {
+ // count >= 3 && count <= 10
+ codegen[outIndex] = 17
+ outIndex++
+ codegen[outIndex] = uint8(count - 3)
+ outIndex++
+ w.codegenFreq[17]++
+ count = 0
+ }
+ }
+ count--
+ for ; count >= 0; count-- {
+ codegen[outIndex] = size
+ outIndex++
+ w.codegenFreq[size]++
+ }
+ // Set up invariant for next time through the loop.
+ size = nextSize
+ count = 1
+ }
+ // Marker indicating the end of the codegen.
+ codegen[outIndex] = badCode
+}
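+
+// A worked example (code lengths invented for illustration): the length
+// sequence 8 x7, 0 x11, 2 is reduced to the codegen stream
+//
+//	8, 16(extra 3), 18(extra 0), 2, badCode
+//
+// i.e. a literal 8, "repeat previous length 6 times", "run of 11 zeros",
+// a literal 2, and the end marker.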
+
+// dynamicSize returns the size of dynamically encoded data in bits.
+func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) {
+ numCodegens = len(w.codegenFreq)
+ for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
+ numCodegens--
+ }
+ header := 3 + 5 + 5 + 4 + (3 * numCodegens) +
+ w.codegenEncoding.bitLength(w.codegenFreq[:]) +
+ int(w.codegenFreq[16])*2 +
+ int(w.codegenFreq[17])*3 +
+ int(w.codegenFreq[18])*7
+ size = header +
+ litEnc.bitLength(w.literalFreq) +
+ offEnc.bitLength(w.offsetFreq) +
+ extraBits
+
+ return size, numCodegens
+}
+
+// fixedSize returns the size of the data encoded with fixed Huffman codes, in bits.
+func (w *huffmanBitWriter) fixedSize(extraBits int) int {
+ return 3 +
+ fixedLiteralEncoding.bitLength(w.literalFreq) +
+ fixedOffsetEncoding.bitLength(w.offsetFreq) +
+ extraBits
+}
+
+// storedSize calculates the stored size, including the header.
+// The function returns the size in bits and whether the data
+// fits inside a single stored block.
+func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
+ if in == nil {
+ return 0, false
+ }
+ if len(in) <= maxStoreBlockSize {
+ return (len(in) + 5) * 8, true
+ }
+ return 0, false
+}
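+
+// The 5 bytes of overhead above are the worst case for a stored block:
+// the 3-bit block header rounded up to a byte boundary plus the 2-byte
+// LEN and 2-byte NLEN fields (RFC 1951, section 3.2.4).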
+
+func (w *huffmanBitWriter) writeCode(c hcode) {
+ if w.err != nil {
+ return
+ }
+ w.bits |= uint64(c.code) << w.nbits
+ w.nbits += uint(c.len)
+ if w.nbits >= 48 {
+ bits := w.bits
+ w.bits >>= 48
+ w.nbits -= 48
+ n := w.nbytes
+ bytes := w.bytes[n : n+6]
+ bytes[0] = byte(bits)
+ bytes[1] = byte(bits >> 8)
+ bytes[2] = byte(bits >> 16)
+ bytes[3] = byte(bits >> 24)
+ bytes[4] = byte(bits >> 32)
+ bytes[5] = byte(bits >> 40)
+ n += 6
+ if n >= bufferFlushSize {
+ w.write(w.bytes[:n])
+ n = 0
+ }
+ w.nbytes = n
+ }
+}
+
+// Write the header of a dynamic Huffman block to the output stream.
+//
+// numLiterals The number of literals specified in codegen
+// numOffsets The number of offsets specified in codegen
+// numCodegens The number of codegens used in codegen
+func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) {
+ if w.err != nil {
+ return
+ }
+ var firstBits int32 = 4
+ if isEof {
+ firstBits = 5
+ }
+ w.writeBits(firstBits, 3)
+ w.writeBits(int32(numLiterals-257), 5)
+ w.writeBits(int32(numOffsets-1), 5)
+ w.writeBits(int32(numCodegens-4), 4)
+
+ for i := 0; i < numCodegens; i++ {
+ value := uint(w.codegenEncoding.codes[codegenOrder[i]].len)
+ w.writeBits(int32(value), 3)
+ }
+
+ i := 0
+ for {
+ var codeWord int = int(w.codegen[i])
+ i++
+ if codeWord == badCode {
+ break
+ }
+ w.writeCode(w.codegenEncoding.codes[uint32(codeWord)])
+
+ switch codeWord {
+ case 16:
+ w.writeBits(int32(w.codegen[i]), 2)
+ i++
+ break
+ case 17:
+ w.writeBits(int32(w.codegen[i]), 3)
+ i++
+ break
+ case 18:
+ w.writeBits(int32(w.codegen[i]), 7)
+ i++
+ break
+ }
+ }
+}
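+
+// As an example (counts chosen only for illustration), a block using 260
+// literal/length codes, 10 offset codes and 18 codegen codes is announced
+// with HLIT = 260-257 = 3, HDIST = 10-1 = 9 and HCLEN = 18-4 = 14 in the
+// 5-, 5- and 4-bit fields written above, followed by 18 3-bit code length
+// code lengths in codegenOrder.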
+
+func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
+ if w.err != nil {
+ return
+ }
+ var flag int32
+ if isEof {
+ flag = 1
+ }
+ w.writeBits(flag, 3)
+ w.flush()
+ w.writeBits(int32(length), 16)
+ w.writeBits(int32(^uint16(length)), 16)
+}
+
+func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
+ if w.err != nil {
+ return
+ }
+ // Indicate that we are a fixed Huffman block
+ var value int32 = 2
+ if isEof {
+ value = 3
+ }
+ w.writeBits(value, 3)
+}
+
+// writeBlock will write a block of tokens with the smallest encoding.
+// The original input can be supplied, and if the huffman encoded data
+// is larger than the original bytes, the data will be written as a
+// stored block.
+// If the input is nil, the tokens will always be Huffman encoded.
+func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
+ if w.err != nil {
+ return
+ }
+
+ tokens = append(tokens, endBlockMarker)
+ numLiterals, numOffsets := w.indexTokens(tokens)
+
+ var extraBits int
+ storedSize, storable := w.storedSize(input)
+ if storable {
+		// We only bother calculating the costs of the extra bits required by
+		// the length and offset fields (which will be the same for both fixed
+		// and dynamic encoding) if we need to compare those two encodings
+		// against the stored encoding.
+ for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ {
+ // First eight length codes have extra size = 0.
+ extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart])
+ }
+ for offsetCode := 4; offsetCode < numOffsets; offsetCode++ {
+ // First four offset codes have extra size = 0.
+ extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode])
+ }
+ }
+
+ // Figure out smallest code.
+ // Fixed Huffman baseline.
+ var literalEncoding = fixedLiteralEncoding
+ var offsetEncoding = fixedOffsetEncoding
+ var size = w.fixedSize(extraBits)
+
+ // Dynamic Huffman?
+ var numCodegens int
+
+ // Generate codegen and codegenFrequencies, which indicates how to encode
+ // the literalEncoding and the offsetEncoding.
+ w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
+ w.codegenEncoding.generate(w.codegenFreq[:], 7)
+ dynamicSize, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
+
+ if dynamicSize < size {
+ size = dynamicSize
+ literalEncoding = w.literalEncoding
+ offsetEncoding = w.offsetEncoding
+ }
+
+ // Stored bytes?
+ if storable && storedSize < size {
+ w.writeStoredHeader(len(input), eof)
+ w.writeBytes(input)
+ return
+ }
+
+ // Huffman.
+ if literalEncoding == fixedLiteralEncoding {
+ w.writeFixedHeader(eof)
+ } else {
+ w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+ }
+
+ // Write the tokens.
+ w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes)
+}
+
+// writeBlockDynamic encodes a block using a dynamic Huffman table.
+// This should be used if the symbols used have a disproportionate
+// histogram distribution.
+// If input is supplied and the compression savings over a stored block are
+// less than about 1/16th of the compressed size, the block is stored.
+func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) {
+ if w.err != nil {
+ return
+ }
+
+ tokens = append(tokens, endBlockMarker)
+ numLiterals, numOffsets := w.indexTokens(tokens)
+
+ // Generate codegen and codegenFrequencies, which indicates how to encode
+ // the literalEncoding and the offsetEncoding.
+ w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
+ w.codegenEncoding.generate(w.codegenFreq[:], 7)
+ size, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, 0)
+
+ // Store bytes, if we don't get a reasonable improvement.
+ if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+ w.writeStoredHeader(len(input), eof)
+ w.writeBytes(input)
+ return
+ }
+
+ // Write Huffman table.
+ w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+
+ // Write the tokens.
+ w.writeTokens(tokens, w.literalEncoding.codes, w.offsetEncoding.codes)
+}
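+
+// To make the threshold concrete (sizes invented for illustration): a
+// 1000 byte input costs (1000+5)*8 = 8040 bits when stored, so the dynamic
+// encoding is only used if it needs at most roughly 7567 bits, i.e. it
+// must save about 6% over the stored form.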
+
+// indexTokens indexes a slice of tokens, and updates
+// literalFreq and offsetFreq, and generates literalEncoding
+// and offsetEncoding.
+// The number of literal and offset tokens is returned.
+func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets int) {
+ for i := range w.literalFreq {
+ w.literalFreq[i] = 0
+ }
+ for i := range w.offsetFreq {
+ w.offsetFreq[i] = 0
+ }
+
+ for _, t := range tokens {
+ if t < matchType {
+ w.literalFreq[t.literal()]++
+ continue
+ }
+ length := t.length()
+ offset := t.offset()
+ w.literalFreq[lengthCodesStart+lengthCode(length)]++
+ w.offsetFreq[offsetCode(offset)]++
+ }
+
+ // get the number of literals
+ numLiterals = len(w.literalFreq)
+ for w.literalFreq[numLiterals-1] == 0 {
+ numLiterals--
+ }
+ // get the number of offsets
+ numOffsets = len(w.offsetFreq)
+ for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 {
+ numOffsets--
+ }
+ if numOffsets == 0 {
+ // We haven't found a single match. If we want to go with the dynamic encoding,
+ // we should count at least one offset to be sure that the offset huffman tree could be encoded.
+ w.offsetFreq[0] = 1
+ numOffsets = 1
+ }
+ w.literalEncoding.generate(w.literalFreq, 15)
+ w.offsetEncoding.generate(w.offsetFreq, 15)
+ return
+}
+
+// writeTokens writes a slice of tokens to the output.
+// codes for literal and offset encoding must be supplied.
+func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) {
+ if w.err != nil {
+ return
+ }
+ for _, t := range tokens {
+ if t < matchType {
+ w.writeCode(leCodes[t.literal()])
+ continue
+ }
+ // Write the length
+ length := t.length()
+ lengthCode := lengthCode(length)
+ w.writeCode(leCodes[lengthCode+lengthCodesStart])
+ extraLengthBits := uint(lengthExtraBits[lengthCode])
+ if extraLengthBits > 0 {
+ extraLength := int32(length - lengthBase[lengthCode])
+ w.writeBits(extraLength, extraLengthBits)
+ }
+ // Write the offset
+ offset := t.offset()
+ offsetCode := offsetCode(offset)
+ w.writeCode(oeCodes[offsetCode])
+ extraOffsetBits := uint(offsetExtraBits[offsetCode])
+ if extraOffsetBits > 0 {
+ extraOffset := int32(offset - offsetBase[offsetCode])
+ w.writeBits(extraOffset, extraOffsetBits)
+ }
+ }
+}
+
+// huffOffset is a static offset encoder used for Huffman-only encoding.
+// It can be reused since we will not be encoding offset values.
+var huffOffset *huffmanEncoder
+
+func init() {
+ w := newHuffmanBitWriter(nil)
+ w.offsetFreq[0] = 1
+ huffOffset = newHuffmanEncoder(offsetCodeCount)
+ huffOffset.generate(w.offsetFreq, 15)
+}
+
+// writeBlockHuff encodes a block of bytes either as
+// Huffman encoded literals or as uncompressed bytes if
+// compression yields only a very small gain.
+func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
+ if w.err != nil {
+ return
+ }
+
+ // Clear histogram
+ for i := range w.literalFreq {
+ w.literalFreq[i] = 0
+ }
+
+ // Add everything as literals
+ histogram(input, w.literalFreq)
+
+ w.literalFreq[endBlockMarker] = 1
+
+ const numLiterals = endBlockMarker + 1
+ const numOffsets = 1
+
+ w.literalEncoding.generate(w.literalFreq, 15)
+
+ // Figure out smallest code.
+ // Always use dynamic Huffman or Store
+ var numCodegens int
+
+ // Generate codegen and codegenFrequencies, which indicates how to encode
+ // the literalEncoding and the offsetEncoding.
+ w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
+ w.codegenEncoding.generate(w.codegenFreq[:], 7)
+ size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0)
+
+ // Store bytes, if we don't get a reasonable improvement.
+ if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+ w.writeStoredHeader(len(input), eof)
+ w.writeBytes(input)
+ return
+ }
+
+ // Huffman.
+ w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+ encoding := w.literalEncoding.codes[:257]
+ n := w.nbytes
+ for _, t := range input {
+ // Bitwriting inlined, ~30% speedup
+ c := encoding[t]
+ w.bits |= uint64(c.code) << w.nbits
+ w.nbits += uint(c.len)
+ if w.nbits < 48 {
+ continue
+ }
+ // Store 6 bytes
+ bits := w.bits
+ w.bits >>= 48
+ w.nbits -= 48
+ bytes := w.bytes[n : n+6]
+ bytes[0] = byte(bits)
+ bytes[1] = byte(bits >> 8)
+ bytes[2] = byte(bits >> 16)
+ bytes[3] = byte(bits >> 24)
+ bytes[4] = byte(bits >> 32)
+ bytes[5] = byte(bits >> 40)
+ n += 6
+ if n < bufferFlushSize {
+ continue
+ }
+ w.write(w.bytes[:n])
+ if w.err != nil {
+ return // Return early in the event of write failures
+ }
+ n = 0
+ }
+ w.nbytes = n
+ w.writeCode(encoding[endBlockMarker])
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go
new file mode 100644
index 0000000000..bdcbd823b0
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@@ -0,0 +1,344 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+ "math"
+ "sort"
+)
+
+// hcode is a huffman code with a bit code and bit length.
+type hcode struct {
+ code, len uint16
+}
+
+type huffmanEncoder struct {
+ codes []hcode
+ freqcache []literalNode
+ bitCount [17]int32
+ lns byLiteral // stored to avoid repeated allocation in generate
+ lfs byFreq // stored to avoid repeated allocation in generate
+}
+
+type literalNode struct {
+ literal uint16
+ freq int32
+}
+
+// A levelInfo describes the state of the constructed tree for a given depth.
+type levelInfo struct {
+	// Our level, for better printing
+ level int32
+
+ // The frequency of the last node at this level
+ lastFreq int32
+
+ // The frequency of the next character to add to this level
+ nextCharFreq int32
+
+ // The frequency of the next pair (from level below) to add to this level.
+ // Only valid if the "needed" value of the next lower level is 0.
+ nextPairFreq int32
+
+ // The number of chains remaining to generate for this level before moving
+ // up to the next level
+ needed int32
+}
+
+// set sets the code and length of an hcode.
+func (h *hcode) set(code uint16, length uint16) {
+ h.len = length
+ h.code = code
+}
+
+func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxInt32} }
+
+func newHuffmanEncoder(size int) *huffmanEncoder {
+ return &huffmanEncoder{codes: make([]hcode, size)}
+}
+
+// Generates a HuffmanCode corresponding to the fixed literal table
+func generateFixedLiteralEncoding() *huffmanEncoder {
+ h := newHuffmanEncoder(maxNumLit)
+ codes := h.codes
+ var ch uint16
+ for ch = 0; ch < maxNumLit; ch++ {
+ var bits uint16
+ var size uint16
+ switch {
+ case ch < 144:
+			// size 8, 00110000 .. 10111111
+ bits = ch + 48
+ size = 8
+ break
+ case ch < 256:
+ // size 9, 110010000 .. 111111111
+ bits = ch + 400 - 144
+ size = 9
+ break
+ case ch < 280:
+ // size 7, 0000000 .. 0010111
+ bits = ch - 256
+ size = 7
+ break
+ default:
+ // size 8, 11000000 .. 11000111
+ bits = ch + 192 - 280
+ size = 8
+ }
+ codes[ch] = hcode{code: reverseBits(bits, byte(size)), len: size}
+ }
+ return h
+}
+
+func generateFixedOffsetEncoding() *huffmanEncoder {
+ h := newHuffmanEncoder(30)
+ codes := h.codes
+ for ch := range codes {
+ codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5}
+ }
+ return h
+}
+
+var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding()
+var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding()
+
+func (h *huffmanEncoder) bitLength(freq []int32) int {
+ var total int
+ for i, f := range freq {
+ if f != 0 {
+ total += int(f) * int(h.codes[i].len)
+ }
+ }
+ return total
+}
+
+const maxBitsLimit = 16
+
+// Return the number of literals assigned to each bit size in the Huffman encoding
+//
+// This method is only called when len(list) >= 3.
+// The cases of 0, 1, and 2 literals are handled by special case code.
+//
+// list An array of the literals with non-zero frequencies
+// and their associated frequencies. The array is in order of increasing
+// frequency, and has as its last element a special element with frequency
+// MaxInt32
+// maxBits The maximum number of bits that should be used to encode any literal.
+// Must be less than 16.
+// return An integer array in which array[i] indicates the number of literals
+// that should be encoded in i bits.
+func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
+ if maxBits >= maxBitsLimit {
+ panic("flate: maxBits too large")
+ }
+ n := int32(len(list))
+ list = list[0 : n+1]
+ list[n] = maxNode()
+
+ // The tree can't have greater depth than n - 1, no matter what. This
+ // saves a little bit of work in some small cases
+ if maxBits > n-1 {
+ maxBits = n - 1
+ }
+
+ // Create information about each of the levels.
+ // A bogus "Level 0" whose sole purpose is so that
+ // level1.prev.needed==0. This makes level1.nextPairFreq
+ // be a legitimate value that never gets chosen.
+ var levels [maxBitsLimit]levelInfo
+ // leafCounts[i] counts the number of literals at the left
+ // of ancestors of the rightmost node at level i.
+ // leafCounts[i][j] is the number of literals at the left
+ // of the level j ancestor.
+ var leafCounts [maxBitsLimit][maxBitsLimit]int32
+
+ for level := int32(1); level <= maxBits; level++ {
+ // For every level, the first two items are the first two characters.
+ // We initialize the levels as if we had already figured this out.
+ levels[level] = levelInfo{
+ level: level,
+ lastFreq: list[1].freq,
+ nextCharFreq: list[2].freq,
+ nextPairFreq: list[0].freq + list[1].freq,
+ }
+ leafCounts[level][level] = 2
+ if level == 1 {
+ levels[level].nextPairFreq = math.MaxInt32
+ }
+ }
+
+ // We need a total of 2*n - 2 items at top level and have already generated 2.
+ levels[maxBits].needed = 2*n - 4
+
+ level := maxBits
+ for {
+ l := &levels[level]
+ if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 {
+			// We've run out of both leaves and pairs.
+ // End all calculations for this level.
+ // To make sure we never come back to this level or any lower level,
+ // set nextPairFreq impossibly large.
+ l.needed = 0
+ levels[level+1].nextPairFreq = math.MaxInt32
+ level++
+ continue
+ }
+
+ prevFreq := l.lastFreq
+ if l.nextCharFreq < l.nextPairFreq {
+ // The next item on this row is a leaf node.
+ n := leafCounts[level][level] + 1
+ l.lastFreq = l.nextCharFreq
+			// Lower leafCounts are the same as for the previous node.
+ leafCounts[level][level] = n
+ l.nextCharFreq = list[n].freq
+ } else {
+ // The next item on this row is a pair from the previous row.
+ // nextPairFreq isn't valid until we generate two
+ // more values in the level below
+ l.lastFreq = l.nextPairFreq
+ // Take leaf counts from the lower level, except counts[level] remains the same.
+ copy(leafCounts[level][:level], leafCounts[level-1][:level])
+ levels[l.level-1].needed = 2
+ }
+
+ if l.needed--; l.needed == 0 {
+ // We've done everything we need to do for this level.
+ // Continue calculating one level up. Fill in nextPairFreq
+ // of that level with the sum of the two nodes we've just calculated on
+ // this level.
+ if l.level == maxBits {
+ // All done!
+ break
+ }
+ levels[l.level+1].nextPairFreq = prevFreq + l.lastFreq
+ level++
+ } else {
+ // If we stole from below, move down temporarily to replenish it.
+ for levels[level-1].needed > 0 {
+ level--
+ }
+ }
+ }
+
+	// Something is wrong if, at the end, the top level hasn't used
+	// all of the leaves.
+ if leafCounts[maxBits][maxBits] != n {
+ panic("leafCounts[maxBits][maxBits] != n")
+ }
+
+ bitCount := h.bitCount[:maxBits+1]
+ bits := 1
+ counts := &leafCounts[maxBits]
+ for level := maxBits; level > 0; level-- {
+ // chain.leafCount gives the number of literals requiring at least "bits"
+ // bits to encode.
+ bitCount[bits] = counts[level] - counts[level-1]
+ bits++
+ }
+ return bitCount
+}
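+
+// For example (frequencies invented for illustration), four literals with
+// frequencies 1, 1, 2 and 4 and maxBits >= 3 get optimal code lengths of
+// 3, 3, 2 and 1 bits, so the returned slice has bitCount[1] = 1,
+// bitCount[2] = 1 and bitCount[3] = 2.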
+
+// Look at the leaves and assign them a bit count and an encoding as specified
+// in RFC 1951 3.2.2
+func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalNode) {
+ code := uint16(0)
+ for n, bits := range bitCount {
+ code <<= 1
+ if n == 0 || bits == 0 {
+ continue
+ }
+		// The literals list[len(list)-bits] .. list[len(list)-1]
+ // are encoded using "bits" bits, and get the values
+ // code, code + 1, .... The code values are
+ // assigned in literal order (not frequency order).
+ chunk := list[len(list)-int(bits):]
+
+ h.lns.sort(chunk)
+ for _, node := range chunk {
+ h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)}
+ code++
+ }
+ list = list[0 : len(list)-int(bits)]
+ }
+}
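+
+// Continuing the example above, bitCount[1] = 1, bitCount[2] = 1 and
+// bitCount[3] = 2 produce the canonical codes 0, 10, 110 and 111 (most
+// frequent literal first), which are bit-reversed before storage because
+// DEFLATE transmits Huffman codes most significant bit first while this
+// writer packs bits LSB-first.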
+
+// Update this Huffman Code object to be the minimum code for the specified frequency count.
+//
+// freq An array of frequencies, in which frequency[i] gives the frequency of literal i.
+// maxBits The maximum number of bits to use for any literal.
+func (h *huffmanEncoder) generate(freq []int32, maxBits int32) {
+ if h.freqcache == nil {
+ // Allocate a reusable buffer with the longest possible frequency table.
+ // Possible lengths are codegenCodeCount, offsetCodeCount and maxNumLit.
+ // The largest of these is maxNumLit, so we allocate for that case.
+ h.freqcache = make([]literalNode, maxNumLit+1)
+ }
+ list := h.freqcache[:len(freq)+1]
+ // Number of non-zero literals
+ count := 0
+ // Set list to be the set of all non-zero literals and their frequencies
+ for i, f := range freq {
+ if f != 0 {
+ list[count] = literalNode{uint16(i), f}
+ count++
+ } else {
+ list[count] = literalNode{}
+ h.codes[i].len = 0
+ }
+ }
+ list[len(freq)] = literalNode{}
+
+ list = list[:count]
+ if count <= 2 {
+ // Handle the small cases here, because they are awkward for the general case code. With
+ // two or fewer literals, everything has bit length 1.
+ for i, node := range list {
+ // "list" is in order of increasing literal value.
+ h.codes[node.literal].set(uint16(i), 1)
+ }
+ return
+ }
+ h.lfs.sort(list)
+
+ // Get the number of literals for each bit count
+ bitCount := h.bitCounts(list, maxBits)
+ // And do the assignment
+ h.assignEncodingAndSize(bitCount, list)
+}
+
+type byLiteral []literalNode
+
+func (s *byLiteral) sort(a []literalNode) {
+ *s = byLiteral(a)
+ sort.Sort(s)
+}
+
+func (s byLiteral) Len() int { return len(s) }
+
+func (s byLiteral) Less(i, j int) bool {
+ return s[i].literal < s[j].literal
+}
+
+func (s byLiteral) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+
+type byFreq []literalNode
+
+func (s *byFreq) sort(a []literalNode) {
+ *s = byFreq(a)
+ sort.Sort(s)
+}
+
+func (s byFreq) Len() int { return len(s) }
+
+func (s byFreq) Less(i, j int) bool {
+ if s[i].freq == s[j].freq {
+ return s[i].literal < s[j].literal
+ }
+ return s[i].freq < s[j].freq
+}
+
+func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go
new file mode 100644
index 0000000000..53b63d9a0b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/inflate.go
@@ -0,0 +1,846 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package flate implements the DEFLATE compressed data format, described in
+// RFC 1951. The gzip and zlib packages implement access to DEFLATE-based file
+// formats.
+package flate
+
+import (
+ "bufio"
+ "io"
+ "strconv"
+ "sync"
+)
+
+const (
+ maxCodeLen = 16 // max length of Huffman code
+ // The next three numbers come from the RFC section 3.2.7, with the
+ // additional proviso in section 3.2.5 which implies that distance codes
+ // 30 and 31 should never occur in compressed data.
+ maxNumLit = 286
+ maxNumDist = 30
+ numCodes = 19 // number of codes in Huffman meta-code
+)
+
+// Initialize the fixedHuffmanDecoder only once upon first use.
+var fixedOnce sync.Once
+var fixedHuffmanDecoder huffmanDecoder
+
+// A CorruptInputError reports the presence of corrupt input at a given offset.
+type CorruptInputError int64
+
+func (e CorruptInputError) Error() string {
+ return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10)
+}
+
+// An InternalError reports an error in the flate code itself.
+type InternalError string
+
+func (e InternalError) Error() string { return "flate: internal error: " + string(e) }
+
+// A ReadError reports an error encountered while reading input.
+//
+// Deprecated: No longer returned.
+type ReadError struct {
+ Offset int64 // byte offset where error occurred
+ Err error // error returned by underlying Read
+}
+
+func (e *ReadError) Error() string {
+ return "flate: read error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
+}
+
+// A WriteError reports an error encountered while writing output.
+//
+// Deprecated: No longer returned.
+type WriteError struct {
+ Offset int64 // byte offset where error occurred
+ Err error // error returned by underlying Write
+}
+
+func (e *WriteError) Error() string {
+ return "flate: write error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
+}
+
+// Resetter resets a ReadCloser returned by NewReader or NewReaderDict
+// to switch to a new underlying Reader. This permits reusing a ReadCloser
+// instead of allocating a new one.
+type Resetter interface {
+ // Reset discards any buffered data and resets the Resetter as if it was
+ // newly initialized with the given reader.
+ Reset(r io.Reader, dict []byte) error
+}
+
+// The data structure for decoding Huffman tables is based on that of
+// zlib. There is a lookup table of a fixed bit width (huffmanChunkBits).
+// For codes smaller than the table width, there are multiple entries
+// (each combination of trailing bits has the same value). For codes
+// larger than the table width, the table contains a link to an overflow
+// table. The width of each entry in the link table is the maximum code
+// size minus the chunk width.
+//
+// Note that you can do a lookup in the table even without all bits
+// filled. Since the extra bits are zero, and the DEFLATE Huffman codes
+// have the property that shorter codes come before longer ones, the
+// bit length estimate in the result is a lower bound on the actual
+// number of bits.
+//
+// See the following:
+// http://www.gzip.org/algorithm.txt
+
+// chunk & 15 is number of bits
+// chunk >> 4 is value, including table link
+
+const (
+ huffmanChunkBits = 9
+ huffmanNumChunks = 1 << huffmanChunkBits
+ huffmanCountMask = 15
+ huffmanValueShift = 4
+)
+
+type huffmanDecoder struct {
+ min int // the minimum code length
+ chunks [huffmanNumChunks]uint32 // chunks as described above
+ links [][]uint32 // overflow links
+ linkMask uint32 // mask the width of the link table
+}
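+
+// For example, a chunk value of 0x0107 describes a 7-bit code (0x0107&15)
+// that decodes to symbol 0x0107>>4 = 16. A count field of
+// huffmanChunkBits+1 instead marks an indirect chunk whose value field is
+// an index into links, where the remaining code bits select the entry.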
+
+// Initialize Huffman decoding tables from array of code lengths.
+// Following this function, h is guaranteed to be initialized into a complete
+// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
+// degenerate case where the tree has only a single symbol with length 1. Empty
+// trees are permitted.
+func (h *huffmanDecoder) init(bits []int) bool {
+ // Sanity enables additional runtime tests during Huffman
+ // table construction. It's intended to be used during
+ // development to supplement the currently ad-hoc unit tests.
+ const sanity = false
+
+ if h.min != 0 {
+ *h = huffmanDecoder{}
+ }
+
+ // Count number of codes of each length,
+ // compute min and max length.
+ var count [maxCodeLen]int
+ var min, max int
+ for _, n := range bits {
+ if n == 0 {
+ continue
+ }
+ if min == 0 || n < min {
+ min = n
+ }
+ if n > max {
+ max = n
+ }
+ count[n]++
+ }
+
+ // Empty tree. The decompressor.huffSym function will fail later if the tree
+ // is used. Technically, an empty tree is only valid for the HDIST tree and
+ // not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
+ // is guaranteed to fail since it will attempt to use the tree to decode the
+ // codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
+ // guaranteed to fail later since the compressed data section must be
+ // composed of at least one symbol (the end-of-block marker).
+ if max == 0 {
+ return true
+ }
+
+ code := 0
+ var nextcode [maxCodeLen]int
+ for i := min; i <= max; i++ {
+ code <<= 1
+ nextcode[i] = code
+ code += count[i]
+ }
+
+ // Check that the coding is complete (i.e., that we've
+ // assigned all 2-to-the-max possible bit sequences).
+ // Exception: To be compatible with zlib, we also need to
+ // accept degenerate single-code codings. See also
+ // TestDegenerateHuffmanCoding.
+ if code != 1<<uint(max) && !(code == 1 && max == 1) {
+ return false
+ }
+
+ h.min = min
+ if max > huffmanChunkBits {
+ numLinks := 1 << (uint(max) - huffmanChunkBits)
+ h.linkMask = uint32(numLinks - 1)
+
+ // create link tables
+ link := nextcode[huffmanChunkBits+1] >> 1
+ h.links = make([][]uint32, huffmanNumChunks-link)
+ for j := uint(link); j < huffmanNumChunks; j++ {
+ reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8
+ reverse >>= uint(16 - huffmanChunkBits)
+ off := j - uint(link)
+ if sanity && h.chunks[reverse] != 0 {
+ panic("impossible: overwriting existing chunk")
+ }
+ h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1))
+ h.links[off] = make([]uint32, numLinks)
+ }
+ }
+
+ for i, n := range bits {
+ if n == 0 {
+ continue
+ }
+ code := nextcode[n]
+ nextcode[n]++
+ chunk := uint32(i<<huffmanValueShift | n)
+ reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8
+ reverse >>= uint(16 - n)
+ if n <= huffmanChunkBits {
+ for off := reverse; off < len(h.chunks); off += 1 << uint(n) {
+ // We should never need to overwrite
+ // an existing chunk. Also, 0 is
+ // never a valid chunk, because the
+ // lower 4 "count" bits should be
+ // between 1 and 15.
+ if sanity && h.chunks[off] != 0 {
+ panic("impossible: overwriting existing chunk")
+ }
+ h.chunks[off] = chunk
+ }
+ } else {
+ j := reverse & (huffmanNumChunks - 1)
+ if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 {
+ // Longer codes should have been
+ // associated with a link table above.
+ panic("impossible: not an indirect chunk")
+ }
+ value := h.chunks[j] >> huffmanValueShift
+ linktab := h.links[value]
+ reverse >>= huffmanChunkBits
+ for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) {
+ if sanity && linktab[off] != 0 {
+ panic("impossible: overwriting existing chunk")
+ }
+ linktab[off] = chunk
+ }
+ }
+ }
+
+ if sanity {
+ // Above we've sanity checked that we never overwrote
+ // an existing entry. Here we additionally check that
+ // we filled the tables completely.
+ for i, chunk := range h.chunks {
+ if chunk == 0 {
+ // As an exception, in the degenerate
+ // single-code case, we allow odd
+ // chunks to be missing.
+ if code == 1 && i%2 == 1 {
+ continue
+ }
+ panic("impossible: missing chunk")
+ }
+ }
+ for _, linktab := range h.links {
+ for _, chunk := range linktab {
+ if chunk == 0 {
+ panic("impossible: missing chunk")
+ }
+ }
+ }
+ }
+
+ return true
+}
+
+// The actual read interface needed by NewReader.
+// If the passed-in io.Reader does not also have ReadByte,
+// NewReader will introduce its own buffering.
+type Reader interface {
+ io.Reader
+ io.ByteReader
+}
+
+// Decompress state.
+type decompressor struct {
+ // Input source.
+ r Reader
+ roffset int64
+
+ // Input bits, in top of b.
+ b uint32
+ nb uint
+
+ // Huffman decoders for literal/length, distance.
+ h1, h2 huffmanDecoder
+
+ // Length arrays used to define Huffman codes.
+ bits *[maxNumLit + maxNumDist]int
+ codebits *[numCodes]int
+
+ // Output history, buffer.
+ dict dictDecoder
+
+ // Temporary buffer (avoids repeated allocation).
+ buf [4]byte
+
+ // Next step in the decompression,
+ // and decompression state.
+ step func(*decompressor)
+ stepState int
+ final bool
+ err error
+ toRead []byte
+ hl, hd *huffmanDecoder
+ copyLen int
+ copyDist int
+}
+
+func (f *decompressor) nextBlock() {
+ for f.nb < 1+2 {
+ if f.err = f.moreBits(); f.err != nil {
+ return
+ }
+ }
+ f.final = f.b&1 == 1
+ f.b >>= 1
+ typ := f.b & 3
+ f.b >>= 2
+ f.nb -= 1 + 2
+ switch typ {
+ case 0:
+ f.dataBlock()
+ case 1:
+ // compressed, fixed Huffman tables
+ f.hl = &fixedHuffmanDecoder
+ f.hd = nil
+ f.huffmanBlock()
+ case 2:
+ // compressed, dynamic Huffman tables
+ if f.err = f.readHuffman(); f.err != nil {
+ break
+ }
+ f.hl = &f.h1
+ f.hd = &f.h2
+ f.huffmanBlock()
+ default:
+ // 3 is reserved.
+ f.err = CorruptInputError(f.roffset)
+ }
+}
+
+func (f *decompressor) Read(b []byte) (int, error) {
+ for {
+ if len(f.toRead) > 0 {
+ n := copy(b, f.toRead)
+ f.toRead = f.toRead[n:]
+ if len(f.toRead) == 0 {
+ return n, f.err
+ }
+ return n, nil
+ }
+ if f.err != nil {
+ return 0, f.err
+ }
+ f.step(f)
+ if f.err != nil && len(f.toRead) == 0 {
+ f.toRead = f.dict.readFlush() // Flush what's left in case of error
+ }
+ }
+}
+
+// Support the io.WriterTo interface for io.Copy and friends.
+func (f *decompressor) WriteTo(w io.Writer) (int64, error) {
+ total := int64(0)
+ flushed := false
+ for {
+ if len(f.toRead) > 0 {
+ n, err := w.Write(f.toRead)
+ total += int64(n)
+ if err != nil {
+ f.err = err
+ return total, err
+ }
+ if n != len(f.toRead) {
+ return total, io.ErrShortWrite
+ }
+ f.toRead = f.toRead[:0]
+ }
+ if f.err != nil && flushed {
+ if f.err == io.EOF {
+ return total, nil
+ }
+ return total, f.err
+ }
+ if f.err == nil {
+ f.step(f)
+ }
+ if len(f.toRead) == 0 && f.err != nil && !flushed {
+ f.toRead = f.dict.readFlush() // Flush what's left in case of error
+ flushed = true
+ }
+ }
+}
+
+func (f *decompressor) Close() error {
+ if f.err == io.EOF {
+ return nil
+ }
+ return f.err
+}
+
+// RFC 1951 section 3.2.7.
+// Compression with dynamic Huffman codes
+
+var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
+
+func (f *decompressor) readHuffman() error {
+ // HLIT[5], HDIST[5], HCLEN[4].
+ for f.nb < 5+5+4 {
+ if err := f.moreBits(); err != nil {
+ return err
+ }
+ }
+ nlit := int(f.b&0x1F) + 257
+ if nlit > maxNumLit {
+ return CorruptInputError(f.roffset)
+ }
+ f.b >>= 5
+ ndist := int(f.b&0x1F) + 1
+ if ndist > maxNumDist {
+ return CorruptInputError(f.roffset)
+ }
+ f.b >>= 5
+ nclen := int(f.b&0xF) + 4
+ // numCodes is 19, so nclen is always valid.
+ f.b >>= 4
+ f.nb -= 5 + 5 + 4
+
+ // (HCLEN+4)*3 bits: code lengths in the magic codeOrder order.
+ for i := 0; i < nclen; i++ {
+ for f.nb < 3 {
+ if err := f.moreBits(); err != nil {
+ return err
+ }
+ }
+ f.codebits[codeOrder[i]] = int(f.b & 0x7)
+ f.b >>= 3
+ f.nb -= 3
+ }
+ for i := nclen; i < len(codeOrder); i++ {
+ f.codebits[codeOrder[i]] = 0
+ }
+ if !f.h1.init(f.codebits[0:]) {
+ return CorruptInputError(f.roffset)
+ }
+
+ // HLIT + 257 code lengths, HDIST + 1 code lengths,
+ // using the code length Huffman code.
+ for i, n := 0, nlit+ndist; i < n; {
+ x, err := f.huffSym(&f.h1)
+ if err != nil {
+ return err
+ }
+ if x < 16 {
+ // Actual length.
+ f.bits[i] = x
+ i++
+ continue
+ }
+ // Repeat previous length or zero.
+ var rep int
+ var nb uint
+ var b int
+ switch x {
+ default:
+ return InternalError("unexpected length code")
+ case 16:
+ rep = 3
+ nb = 2
+ if i == 0 {
+ return CorruptInputError(f.roffset)
+ }
+ b = f.bits[i-1]
+ case 17:
+ rep = 3
+ nb = 3
+ b = 0
+ case 18:
+ rep = 11
+ nb = 7
+ b = 0
+ }
+ for f.nb < nb {
+ if err := f.moreBits(); err != nil {
+ return err
+ }
+ }
+ rep += int(f.b & uint32(1<<nb-1))
+ f.b >>= nb
+ f.nb -= nb
+ if i+rep > n {
+ return CorruptInputError(f.roffset)
+ }
+ for j := 0; j < rep; j++ {
+ f.bits[i] = b
+ i++
+ }
+ }
+
+ if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) {
+ return CorruptInputError(f.roffset)
+ }
+
+ // As an optimization, we can initialize the min bits to read at a time
+ // for the HLIT tree to the length of the EOB marker since we know that
+ // every block must terminate with one. This preserves the property that
+ // we never read any extra bytes after the end of the DEFLATE stream.
+ if f.h1.min < f.bits[endBlockMarker] {
+ f.h1.min = f.bits[endBlockMarker]
+ }
+
+ return nil
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, the fixed
+// distance encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanBlock() {
+ const (
+ stateInit = iota // Zero value must be stateInit
+ stateDict
+ )
+
+ switch f.stepState {
+ case stateInit:
+ goto readLiteral
+ case stateDict:
+ goto copyHistory
+ }
+
+readLiteral:
+ // Read literal and/or (length, distance) according to RFC section 3.2.3.
+ {
+ v, err := f.huffSym(f.hl)
+ if err != nil {
+ f.err = err
+ return
+ }
+ var n uint // number of bits extra
+ var length int
+ switch {
+ case v < 256:
+ f.dict.writeByte(byte(v))
+ if f.dict.availWrite() == 0 {
+ f.toRead = f.dict.readFlush()
+ f.step = (*decompressor).huffmanBlock
+ f.stepState = stateInit
+ return
+ }
+ goto readLiteral
+ case v == 256:
+ f.finishBlock()
+ return
+ // otherwise, reference to older data
+ case v < 265:
+ length = v - (257 - 3)
+ n = 0
+ case v < 269:
+ length = v*2 - (265*2 - 11)
+ n = 1
+ case v < 273:
+ length = v*4 - (269*4 - 19)
+ n = 2
+ case v < 277:
+ length = v*8 - (273*8 - 35)
+ n = 3
+ case v < 281:
+ length = v*16 - (277*16 - 67)
+ n = 4
+ case v < 285:
+ length = v*32 - (281*32 - 131)
+ n = 5
+ case v < maxNumLit:
+ length = 258
+ n = 0
+ default:
+ f.err = CorruptInputError(f.roffset)
+ return
+ }
+ if n > 0 {
+ for f.nb < n {
+ if err = f.moreBits(); err != nil {
+ f.err = err
+ return
+ }
+ }
+ length += int(f.b & uint32(1<<n-1))
+ f.b >>= n
+ f.nb -= n
+ }
+
+ var dist int
+ if f.hd == nil {
+ for f.nb < 5 {
+ if err = f.moreBits(); err != nil {
+ f.err = err
+ return
+ }
+ }
+ dist = int(reverseByte[(f.b&0x1F)<<3])
+ f.b >>= 5
+ f.nb -= 5
+ } else {
+ if dist, err = f.huffSym(f.hd); err != nil {
+ f.err = err
+ return
+ }
+ }
+
+ switch {
+ case dist < 4:
+ dist++
+ case dist < maxNumDist:
+ nb := uint(dist-2) >> 1
+ // have 1 bit in bottom of dist, need nb more.
+ extra := (dist & 1) << nb
+ for f.nb < nb {
+ if err = f.moreBits(); err != nil {
+ f.err = err
+ return
+ }
+ }
+ extra |= int(f.b & uint32(1<<nb-1))
+ f.b >>= nb
+ f.nb -= nb
+ dist = 1<<(nb+1) + 1 + extra
+ default:
+ f.err = CorruptInputError(f.roffset)
+ return
+ }
+
+ // No check on length; encoding can be prescient.
+ if dist > f.dict.histSize() {
+ f.err = CorruptInputError(f.roffset)
+ return
+ }
+
+ f.copyLen, f.copyDist = length, dist
+ goto copyHistory
+ }
+
+copyHistory:
+ // Perform a backwards copy according to RFC section 3.2.3.
+ {
+ cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+ if cnt == 0 {
+ cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+ }
+ f.copyLen -= cnt
+
+ if f.dict.availWrite() == 0 || f.copyLen > 0 {
+ f.toRead = f.dict.readFlush()
+ f.step = (*decompressor).huffmanBlock // We need to continue this work
+ f.stepState = stateDict
+ return
+ }
+ goto readLiteral
+ }
+}
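+
+// A worked example of the decode arithmetic above, using values from
+// RFC 1951 section 3.2.5: length symbol v = 266 takes the "v < 269"
+// branch, so length = 266*2 - (265*2 - 11) = 13 with n = 1 extra bit,
+// covering lengths 13-14; distance symbol 6 gives nb = (6-2)>>1 = 2 extra
+// bits and a base of 1<<(nb+1) + 1 = 9, covering distances 9-12.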
+
+// Copy a single uncompressed data block from input to output.
+func (f *decompressor) dataBlock() {
+ // Uncompressed.
+ // Discard current half-byte.
+ f.nb = 0
+ f.b = 0
+
+ // Length then ones-complement of length.
+ nr, err := io.ReadFull(f.r, f.buf[0:4])
+ f.roffset += int64(nr)
+ if err != nil {
+ if err == io.EOF {
+ err = io.ErrUnexpectedEOF
+ }
+ f.err = err
+ return
+ }
+ n := int(f.buf[0]) | int(f.buf[1])<<8
+ nn := int(f.buf[2]) | int(f.buf[3])<<8
+ if uint16(nn) != uint16(^n) {
+ f.err = CorruptInputError(f.roffset)
+ return
+ }
+
+ if n == 0 {
+ f.toRead = f.dict.readFlush()
+ f.finishBlock()
+ return
+ }
+
+ f.copyLen = n
+ f.copyData()
+}
+
+// copyData copies f.copyLen bytes from the underlying reader into f.dict.
+// It pauses for reads when f.dict is full.
+func (f *decompressor) copyData() {
+ buf := f.dict.writeSlice()
+ if len(buf) > f.copyLen {
+ buf = buf[:f.copyLen]
+ }
+
+ cnt, err := io.ReadFull(f.r, buf)
+ f.roffset += int64(cnt)
+ f.copyLen -= cnt
+ f.dict.writeMark(cnt)
+ if err != nil {
+ if err == io.EOF {
+ err = io.ErrUnexpectedEOF
+ }
+ f.err = err
+ return
+ }
+
+ if f.dict.availWrite() == 0 || f.copyLen > 0 {
+ f.toRead = f.dict.readFlush()
+ f.step = (*decompressor).copyData
+ return
+ }
+ f.finishBlock()
+}
+
+func (f *decompressor) finishBlock() {
+ if f.final {
+ if f.dict.availRead() > 0 {
+ f.toRead = f.dict.readFlush()
+ }
+ f.err = io.EOF
+ }
+ f.step = (*decompressor).nextBlock
+}
+
+func (f *decompressor) moreBits() error {
+ c, err := f.r.ReadByte()
+ if err != nil {
+ if err == io.EOF {
+ err = io.ErrUnexpectedEOF
+ }
+ return err
+ }
+ f.roffset++
+ f.b |= uint32(c) << f.nb
+ f.nb += 8
+ return nil
+}
+
+// Read the next Huffman-encoded symbol from f according to h.
+func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
+	// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+	// with a single element, huffSym must error on these two edge cases. In both
+	// cases, the chunks entry will be 0 for the invalid sequence, causing it to
+	// satisfy the n == 0 check below.
+ n := uint(h.min)
+ for {
+ for f.nb < n {
+ if err := f.moreBits(); err != nil {
+ return 0, err
+ }
+ }
+ chunk := h.chunks[f.b&(huffmanNumChunks-1)]
+ n = uint(chunk & huffmanCountMask)
+ if n > huffmanChunkBits {
+ chunk = h.links[chunk>>huffmanValueShift][(f.b>>huffmanChunkBits)&h.linkMask]
+ n = uint(chunk & huffmanCountMask)
+ }
+ if n <= f.nb {
+ if n == 0 {
+ f.err = CorruptInputError(f.roffset)
+ return 0, f.err
+ }
+ f.b >>= n
+ f.nb -= n
+ return int(chunk >> huffmanValueShift), nil
+ }
+ }
+}
+
+func makeReader(r io.Reader) Reader {
+ if rr, ok := r.(Reader); ok {
+ return rr
+ }
+ return bufio.NewReader(r)
+}
+
+func fixedHuffmanDecoderInit() {
+ fixedOnce.Do(func() {
+ // These come from the RFC section 3.2.6.
+ var bits [288]int
+ for i := 0; i < 144; i++ {
+ bits[i] = 8
+ }
+ for i := 144; i < 256; i++ {
+ bits[i] = 9
+ }
+ for i := 256; i < 280; i++ {
+ bits[i] = 7
+ }
+ for i := 280; i < 288; i++ {
+ bits[i] = 8
+ }
+ fixedHuffmanDecoder.init(bits[:])
+ })
+}
+
+func (f *decompressor) Reset(r io.Reader, dict []byte) error {
+ *f = decompressor{
+ r: makeReader(r),
+ bits: f.bits,
+ codebits: f.codebits,
+ dict: f.dict,
+ step: (*decompressor).nextBlock,
+ }
+ f.dict.init(maxMatchOffset, dict)
+ return nil
+}
+
+// NewReader returns a new ReadCloser that can be used
+// to read the uncompressed version of r.
+// If r does not also implement io.ByteReader,
+// the decompressor may read more data than necessary from r.
+// It is the caller's responsibility to call Close on the ReadCloser
+// when finished reading.
+//
+// The ReadCloser returned by NewReader also implements Resetter.
+func NewReader(r io.Reader) io.ReadCloser {
+ fixedHuffmanDecoderInit()
+
+ var f decompressor
+ f.r = makeReader(r)
+ f.bits = new([maxNumLit + maxNumDist]int)
+ f.codebits = new([numCodes]int)
+ f.step = (*decompressor).nextBlock
+ f.dict.init(maxMatchOffset, nil)
+ return &f
+}
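+
+// A minimal usage sketch (the variable deflated is assumed to hold a
+// valid DEFLATE stream; error handling elided):
+//
+//	r := flate.NewReader(bytes.NewReader(deflated))
+//	defer r.Close()
+//	plain, err := ioutil.ReadAll(r)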
+
+// NewReaderDict is like NewReader but initializes the reader
+// with a preset dictionary. The returned Reader behaves as if
+// the uncompressed data stream started with the given dictionary,
+// which has already been read. NewReaderDict is typically used
+// to read data compressed by NewWriterDict.
+//
+// The ReadCloser returned by NewReader also implements Resetter.
+func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser {
+ fixedHuffmanDecoderInit()
+
+ var f decompressor
+ f.r = makeReader(r)
+ f.bits = new([maxNumLit + maxNumDist]int)
+ f.codebits = new([numCodes]int)
+ f.step = (*decompressor).nextBlock
+ f.dict.init(maxMatchOffset, dict)
+ return &f
+}
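+
+// Editorial sketch (not part of the upstream file): NewReader is normally
+// driven through io.Copy. Assuming src carries a raw DEFLATE stream, a
+// minimal decompression helper could look like this; the function name and
+// parameters are illustrative only.
+func exampleInflateTo(dst io.Writer, src io.Reader) error {
+ r := NewReader(src)
+ // Close releases decompressor state; it does not close src.
+ defer r.Close()
+ _, err := io.Copy(dst, r)
+ return err
+}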
diff --git a/vendor/github.com/klauspost/compress/flate/reverse_bits.go b/vendor/github.com/klauspost/compress/flate/reverse_bits.go
new file mode 100644
index 0000000000..c1a02720d1
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/reverse_bits.go
@@ -0,0 +1,48 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+var reverseByte = [256]byte{
+ 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
+ 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
+ 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
+ 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
+ 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
+ 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
+ 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
+ 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
+ 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
+ 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
+ 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
+ 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
+ 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
+ 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
+ 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
+ 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
+ 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
+ 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
+ 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
+ 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
+ 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
+ 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
+ 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
+ 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
+ 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
+ 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
+ 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
+ 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
+ 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
+ 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
+ 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
+ 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
+}
+
+func reverseUint16(v uint16) uint16 {
+ return uint16(reverseByte[v>>8]) | uint16(reverseByte[v&0xFF])<<8
+}
+
+func reverseBits(number uint16, bitLength byte) uint16 {
+ return reverseUint16(number << uint8(16-bitLength))
+}
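+
+// Editorial note (not part of the upstream file): reverseBits mirrors the low
+// bitLength bits of number. The encoder bit-reverses canonical Huffman codes
+// before writing them, since DEFLATE packs Huffman codes starting from the
+// most significant bit while other fields are packed LSB first (RFC 1951,
+// section 3.1.1). For example, reverseBits(0x3, 4) turns 0011 into 1100 (0xC).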
diff --git a/vendor/github.com/klauspost/compress/flate/snappy.go b/vendor/github.com/klauspost/compress/flate/snappy.go
new file mode 100644
index 0000000000..0bbd946c01
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/snappy.go
@@ -0,0 +1,856 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Modified for deflate by Klaus Post (c) 2015.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// emitLiteral writes a literal chunk to the token buffer.
+func emitLiteral(dst *tokens, lit []byte) {
+ ol := int(dst.n)
+ for i, v := range lit {
+ dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
+ }
+ dst.n += uint16(len(lit))
+}
+
+// emitCopy writes a single copy token to the token buffer.
+func emitCopy(dst *tokens, offset, length int) {
+ dst.tokens[dst.n] = matchToken(uint32(length-3), uint32(offset-minOffsetSize))
+ dst.n++
+}
+
+type snappyEnc interface {
+ Encode(dst *tokens, src []byte)
+ Reset()
+}
+
+func newSnappy(level int) snappyEnc {
+ switch level {
+ case 1:
+ return &snappyL1{}
+ case 2:
+ return &snappyL2{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}
+ case 3:
+ return &snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}
+ case 4:
+ return &snappyL4{snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}}
+ default:
+ panic("invalid level specified")
+ }
+}
+
+const (
+ tableBits = 14 // Bits used in the table
+ tableSize = 1 << tableBits // Size of the table
+ tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks.
+ tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
+ baseMatchOffset = 1 // The smallest match offset
+ baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5
+ maxMatchOffset = 1 << 15 // The largest match offset
+)
+
+func load32(b []byte, i int) uint32 {
+ b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+ return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+ b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+ return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+ uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func hash(u uint32) uint32 {
+ return (u * 0x1e35a7bd) >> tableShift
+}
+
+// snappyL1 encapsulates level 1 compression
+type snappyL1 struct{}
+
+func (e *snappyL1) Reset() {}
+
+func (e *snappyL1) Encode(dst *tokens, src []byte) {
+ const (
+ inputMargin = 16 - 1
+ minNonLiteralBlockSize = 1 + 1 + inputMargin
+ )
+
+ // This check isn't in the Snappy implementation, but there, the caller
+ // instead of the callee handles this case.
+ if len(src) < minNonLiteralBlockSize {
+ // We do not fill the token table.
+ // This will be picked up by caller.
+ dst.n = uint16(len(src))
+ return
+ }
+
+ // Initialize the hash table.
+ //
+ // The table element type is uint16, as s < sLimit and sLimit < len(src)
+ // and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
+ var table [tableSize]uint16
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := len(src) - inputMargin
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := 0
+
+ // The encoded form must start with a literal, as there are no previous
+ // bytes to copy, so we start looking for hash matches at s == 1.
+ s := 1
+ nextHash := hash(load32(src, s))
+
+ for {
+ // Copied from the C++ snappy implementation:
+ //
+ // Heuristic match skipping: If 32 bytes are scanned with no matches
+ // found, start looking only at every other byte. If 32 more bytes are
+ // scanned (or skipped), look at every third byte, etc.. When a match
+ // is found, immediately go back to looking at every byte. This is a
+ // small loss (~5% performance, ~0.1% density) for compressible data
+ // due to more bookkeeping, but for non-compressible data (such as
+ // JPEG) it's a huge win since the compressor quickly "realizes" the
+ // data is incompressible and doesn't bother looking for matches
+ // everywhere.
+ //
+ // The "skip" variable keeps track of how many bytes there are since
+ // the last match; dividing it by 32 (ie. right-shifting by five) gives
+ // the number of bytes to move ahead for each iteration.
+ skip := 32
+
+ nextS := s
+ candidate := 0
+ for {
+ s = nextS
+ bytesBetweenHashLookups := skip >> 5
+ nextS = s + bytesBetweenHashLookups
+ skip += bytesBetweenHashLookups
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ candidate = int(table[nextHash&tableMask])
+ table[nextHash&tableMask] = uint16(s)
+ nextHash = hash(load32(src, nextS))
+ // TODO: < should be <=, and add a test for that.
+ if s-candidate < maxMatchOffset && load32(src, s) == load32(src, candidate) {
+ break
+ }
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+ emitLiteral(dst, src[nextEmit:s])
+
+ // Call emitCopy, and then see if another emitCopy could be our next
+ // move. Repeat until we find no match for the input immediately after
+ // what was consumed by the last emitCopy call.
+ //
+ // If we exit this loop normally then we need to call emitLiteral next,
+ // though we don't yet know how big the literal will be. We handle that
+ // by proceeding to the next iteration of the main loop. We also can
+ // exit this loop via goto if we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+ base := s
+
+ // Extend the 4-byte match as long as possible.
+ //
+ // This is an inlined version of Snappy's:
+ // s = extendMatch(src, candidate+4, s+4)
+ s += 4
+ s1 := base + maxMatchLength
+ if s1 > len(src) {
+ s1 = len(src)
+ }
+ a := src[s:s1]
+ b := src[candidate+4:]
+ b = b[:len(a)]
+ l := len(a)
+ for i := range a {
+ if a[i] != b[i] {
+ l = i
+ break
+ }
+ }
+ s += l
+
+ // matchToken is flate's equivalent of Snappy's emitCopy.
+ dst.tokens[dst.n] = matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset))
+ dst.n++
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ // We could immediately start working at s now, but to improve
+ // compression we first update the hash table at s-1 and at s. If
+ // another emitCopy is not our next move, also calculate nextHash
+ // at s+1. At least on GOARCH=amd64, these three hash calculations
+ // are faster as one load64 call (with some shifts) instead of
+ // three load32 calls.
+ x := load64(src, s-1)
+ prevHash := hash(uint32(x >> 0))
+ table[prevHash&tableMask] = uint16(s - 1)
+ currHash := hash(uint32(x >> 8))
+ candidate = int(table[currHash&tableMask])
+ table[currHash&tableMask] = uint16(s)
+ // TODO: >= should be >, and add a test for that.
+ if s-candidate >= maxMatchOffset || uint32(x>>8) != load32(src, candidate) {
+ nextHash = hash(uint32(x >> 16))
+ s++
+ break
+ }
+ }
+ }
+
+emitRemainder:
+ if nextEmit < len(src) {
+ emitLiteral(dst, src[nextEmit:])
+ }
+}
+
+type tableEntry struct {
+ val uint32
+ offset int32
+}
+
+func load3232(b []byte, i int32) uint32 {
+ b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+ return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load6432(b []byte, i int32) uint64 {
+ b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+ return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+ uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+// snappyGen holds the state shared by the level 2-4 encoders: the
+// previous byte block, used for cross-block matching, and the running
+// offset of the current block.
+type snappyGen struct {
+ prev []byte
+ cur int32
+}
+
+// snappyL2 encapsulates level 2 compression. In addition to the shared
+// snappyGen state it keeps a hash table of recent positions and values,
+// which allows matches into the previous block.
+type snappyL2 struct {
+ snappyGen
+ table [tableSize]tableEntry
+}
+
+// Encode uses a similar algorithm to level 1, but is capable of
+// matching across blocks, giving better compression at a small slowdown.
+func (e *snappyL2) Encode(dst *tokens, src []byte) {
+ const (
+ inputMargin = 16 - 1
+ minNonLiteralBlockSize = 1 + 1 + inputMargin
+ )
+
+ // Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
+ if e.cur > 1<<30 {
+ for i := range e.table {
+ e.table[i] = tableEntry{}
+ }
+ e.cur = maxStoreBlockSize
+ }
+
+ // This check isn't in the Snappy implementation, but there, the caller
+ // instead of the callee handles this case.
+ if len(src) < minNonLiteralBlockSize {
+ // We do not fill the token table.
+ // This will be picked up by caller.
+ dst.n = uint16(len(src))
+ e.cur += maxStoreBlockSize
+ e.prev = e.prev[:0]
+ return
+ }
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := int32(len(src) - inputMargin)
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := int32(0)
+ s := int32(0)
+ cv := load3232(src, s)
+ nextHash := hash(cv)
+
+ for {
+ // Copied from the C++ snappy implementation:
+ //
+ // Heuristic match skipping: If 32 bytes are scanned with no matches
+ // found, start looking only at every other byte. If 32 more bytes are
+ // scanned (or skipped), look at every third byte, etc.. When a match
+ // is found, immediately go back to looking at every byte. This is a
+ // small loss (~5% performance, ~0.1% density) for compressible data
+ // due to more bookkeeping, but for non-compressible data (such as
+ // JPEG) it's a huge win since the compressor quickly "realizes" the
+ // data is incompressible and doesn't bother looking for matches
+ // everywhere.
+ //
+ // The "skip" variable keeps track of how many bytes there are since
+ // the last match; dividing it by 32 (ie. right-shifting by five) gives
+ // the number of bytes to move ahead for each iteration.
+ skip := int32(32)
+
+ nextS := s
+ var candidate tableEntry
+ for {
+ s = nextS
+ bytesBetweenHashLookups := skip >> 5
+ nextS = s + bytesBetweenHashLookups
+ skip += bytesBetweenHashLookups
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ candidate = e.table[nextHash&tableMask]
+ now := load3232(src, nextS)
+ e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv}
+ nextHash = hash(now)
+
+ offset := s - (candidate.offset - e.cur)
+ if offset >= maxMatchOffset || cv != candidate.val {
+ // Out of range or not matched.
+ cv = now
+ continue
+ }
+ break
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+ emitLiteral(dst, src[nextEmit:s])
+
+ // Call emitCopy, and then see if another emitCopy could be our next
+ // move. Repeat until we find no match for the input immediately after
+ // what was consumed by the last emitCopy call.
+ //
+ // If we exit this loop normally then we need to call emitLiteral next,
+ // though we don't yet know how big the literal will be. We handle that
+ // by proceeding to the next iteration of the main loop. We also can
+ // exit this loop via goto if we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+
+ // Extend the 4-byte match as long as possible.
+ //
+ s += 4
+ t := candidate.offset - e.cur + 4
+ l := e.matchlen(s, t, src)
+
+ // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
+ dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
+ dst.n++
+ s += l
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ // We could immediately start working at s now, but to improve
+ // compression we first update the hash table at s-1 and at s. If
+ // another emitCopy is not our next move, also calculate nextHash
+ // at s+1. At least on GOARCH=amd64, these three hash calculations
+ // are faster as one load64 call (with some shifts) instead of
+ // three load32 calls.
+ x := load6432(src, s-1)
+ prevHash := hash(uint32(x))
+ e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)}
+ x >>= 8
+ currHash := hash(uint32(x))
+ candidate = e.table[currHash&tableMask]
+ e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)}
+
+ offset := s - (candidate.offset - e.cur)
+ if offset >= maxMatchOffset || uint32(x) != candidate.val {
+ cv = uint32(x >> 8)
+ nextHash = hash(cv)
+ s++
+ break
+ }
+ }
+ }
+
+emitRemainder:
+ if int(nextEmit) < len(src) {
+ emitLiteral(dst, src[nextEmit:])
+ }
+ e.cur += int32(len(src))
+ e.prev = e.prev[:len(src)]
+ copy(e.prev, src)
+}
+
+type tableEntryPrev struct {
+ Cur tableEntry
+ Prev tableEntry
+}
+
+// snappyL3 encapsulates level 3 compression. Its hash table keeps
+// the two most recent candidates per entry.
+type snappyL3 struct {
+ snappyGen
+ table [tableSize]tableEntryPrev
+}
+
+// Encode uses a similar algorithm to level 2, but checks up to two
+// match candidates per position.
+func (e *snappyL3) Encode(dst *tokens, src []byte) {
+ const (
+ inputMargin = 16 - 1
+ minNonLiteralBlockSize = 1 + 1 + inputMargin
+ )
+
+ // Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
+ if e.cur > 1<<30 {
+ for i := range e.table {
+ e.table[i] = tableEntryPrev{}
+ }
+ e.cur = maxStoreBlockSize
+ }
+
+ // This check isn't in the Snappy implementation, but there, the caller
+ // instead of the callee handles this case.
+ if len(src) < minNonLiteralBlockSize {
+ // We do not fill the token table.
+ // This will be picked up by caller.
+ dst.n = uint16(len(src))
+ e.cur += maxStoreBlockSize
+ e.prev = e.prev[:0]
+ return
+ }
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := int32(len(src) - inputMargin)
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := int32(0)
+ s := int32(0)
+ cv := load3232(src, s)
+ nextHash := hash(cv)
+
+ for {
+ // Copied from the C++ snappy implementation:
+ //
+ // Heuristic match skipping: If 32 bytes are scanned with no matches
+ // found, start looking only at every other byte. If 32 more bytes are
+ // scanned (or skipped), look at every third byte, etc.. When a match
+ // is found, immediately go back to looking at every byte. This is a
+ // small loss (~5% performance, ~0.1% density) for compressible data
+ // due to more bookkeeping, but for non-compressible data (such as
+ // JPEG) it's a huge win since the compressor quickly "realizes" the
+ // data is incompressible and doesn't bother looking for matches
+ // everywhere.
+ //
+ // The "skip" variable keeps track of how many bytes there are since
+ // the last match; dividing it by 32 (ie. right-shifting by five) gives
+ // the number of bytes to move ahead for each iteration.
+ skip := int32(32)
+
+ nextS := s
+ var candidate tableEntry
+ for {
+ s = nextS
+ bytesBetweenHashLookups := skip >> 5
+ nextS = s + bytesBetweenHashLookups
+ skip += bytesBetweenHashLookups
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ candidates := e.table[nextHash&tableMask]
+ now := load3232(src, nextS)
+ e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
+ nextHash = hash(now)
+
+ // Check both candidates
+ candidate = candidates.Cur
+ if cv == candidate.val {
+ offset := s - (candidate.offset - e.cur)
+ if offset < maxMatchOffset {
+ break
+ }
+ } else {
+ // We only check if value mismatches.
+ // Offset will always be invalid in other cases.
+ candidate = candidates.Prev
+ if cv == candidate.val {
+ offset := s - (candidate.offset - e.cur)
+ if offset < maxMatchOffset {
+ break
+ }
+ }
+ }
+ cv = now
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+ emitLiteral(dst, src[nextEmit:s])
+
+ // Call emitCopy, and then see if another emitCopy could be our next
+ // move. Repeat until we find no match for the input immediately after
+ // what was consumed by the last emitCopy call.
+ //
+ // If we exit this loop normally then we need to call emitLiteral next,
+ // though we don't yet know how big the literal will be. We handle that
+ // by proceeding to the next iteration of the main loop. We also can
+ // exit this loop via goto if we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+
+ // Extend the 4-byte match as long as possible.
+ //
+ s += 4
+ t := candidate.offset - e.cur + 4
+ l := e.matchlen(s, t, src)
+
+ // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
+ dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
+ dst.n++
+ s += l
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ // We could immediately start working at s now, but to improve
+ // compression we first update the hash table at s-2, s-1 and at s. If
+ // another emitCopy is not our next move, also calculate nextHash
+ // at s+1. At least on GOARCH=amd64, these three hash calculations
+ // are faster as one load64 call (with some shifts) instead of
+ // three load32 calls.
+ x := load6432(src, s-2)
+ prevHash := hash(uint32(x))
+
+ e.table[prevHash&tableMask] = tableEntryPrev{
+ Prev: e.table[prevHash&tableMask].Cur,
+ Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)},
+ }
+ x >>= 8
+ prevHash = hash(uint32(x))
+
+ e.table[prevHash&tableMask] = tableEntryPrev{
+ Prev: e.table[prevHash&tableMask].Cur,
+ Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)},
+ }
+ x >>= 8
+ currHash := hash(uint32(x))
+ candidates := e.table[currHash&tableMask]
+ cv = uint32(x)
+ e.table[currHash&tableMask] = tableEntryPrev{
+ Prev: candidates.Cur,
+ Cur: tableEntry{offset: s + e.cur, val: cv},
+ }
+
+ // Check both candidates
+ candidate = candidates.Cur
+ if cv == candidate.val {
+ offset := s - (candidate.offset - e.cur)
+ if offset < maxMatchOffset {
+ continue
+ }
+ } else {
+ // We only check if value mismatches.
+ // Offset will always be invalid in other cases.
+ candidate = candidates.Prev
+ if cv == candidate.val {
+ offset := s - (candidate.offset - e.cur)
+ if offset < maxMatchOffset {
+ continue
+ }
+ }
+ }
+ cv = uint32(x >> 8)
+ nextHash = hash(cv)
+ s++
+ break
+ }
+ }
+
+emitRemainder:
+ if int(nextEmit) < len(src) {
+ emitLiteral(dst, src[nextEmit:])
+ }
+ e.cur += int32(len(src))
+ e.prev = e.prev[:len(src)]
+ copy(e.prev, src)
+}
+
+// snappyL4 encapsulates level 4 compression. It extends level 3 by
+// trying an alternative candidate when the first match is short.
+type snappyL4 struct {
+ snappyL3
+}
+
+// Encode uses a similar algorithm to level 3, but also tries an
+// alternative candidate when the first match isn't long enough.
+func (e *snappyL4) Encode(dst *tokens, src []byte) {
+ const (
+ inputMargin = 16 - 1
+ minNonLiteralBlockSize = 1 + 1 + inputMargin
+ matchLenGood = 12
+ )
+
+ // Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
+ if e.cur > 1<<30 {
+ for i := range e.table {
+ e.table[i] = tableEntryPrev{}
+ }
+ e.cur = maxStoreBlockSize
+ }
+
+ // This check isn't in the Snappy implementation, but there, the caller
+ // instead of the callee handles this case.
+ if len(src) < minNonLiteralBlockSize {
+ // We do not fill the token table.
+ // This will be picked up by caller.
+ dst.n = uint16(len(src))
+ e.cur += maxStoreBlockSize
+ e.prev = e.prev[:0]
+ return
+ }
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := int32(len(src) - inputMargin)
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := int32(0)
+ s := int32(0)
+ cv := load3232(src, s)
+ nextHash := hash(cv)
+
+ for {
+ // Copied from the C++ snappy implementation:
+ //
+ // Heuristic match skipping: If 32 bytes are scanned with no matches
+ // found, start looking only at every other byte. If 32 more bytes are
+ // scanned (or skipped), look at every third byte, etc.. When a match
+ // is found, immediately go back to looking at every byte. This is a
+ // small loss (~5% performance, ~0.1% density) for compressible data
+ // due to more bookkeeping, but for non-compressible data (such as
+ // JPEG) it's a huge win since the compressor quickly "realizes" the
+ // data is incompressible and doesn't bother looking for matches
+ // everywhere.
+ //
+ // The "skip" variable keeps track of how many bytes there are since
+ // the last match; dividing it by 32 (ie. right-shifting by five) gives
+ // the number of bytes to move ahead for each iteration.
+ skip := int32(32)
+
+ nextS := s
+ var candidate tableEntry
+ var candidateAlt tableEntry
+ for {
+ s = nextS
+ bytesBetweenHashLookups := skip >> 5
+ nextS = s + bytesBetweenHashLookups
+ skip += bytesBetweenHashLookups
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ candidates := e.table[nextHash&tableMask]
+ now := load3232(src, nextS)
+ e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
+ nextHash = hash(now)
+
+ // Check both candidates
+ candidate = candidates.Cur
+ if cv == candidate.val {
+ offset := s - (candidate.offset - e.cur)
+ if offset < maxMatchOffset {
+ offset = s - (candidates.Prev.offset - e.cur)
+ if cv == candidates.Prev.val && offset < maxMatchOffset {
+ candidateAlt = candidates.Prev
+ }
+ break
+ }
+ } else {
+ // We only check if value mismatches.
+ // Offset will always be invalid in other cases.
+ candidate = candidates.Prev
+ if cv == candidate.val {
+ offset := s - (candidate.offset - e.cur)
+ if offset < maxMatchOffset {
+ break
+ }
+ }
+ }
+ cv = now
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+ emitLiteral(dst, src[nextEmit:s])
+
+ // Call emitCopy, and then see if another emitCopy could be our next
+ // move. Repeat until we find no match for the input immediately after
+ // what was consumed by the last emitCopy call.
+ //
+ // If we exit this loop normally then we need to call emitLiteral next,
+ // though we don't yet know how big the literal will be. We handle that
+ // by proceeding to the next iteration of the main loop. We also can
+ // exit this loop via goto if we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+
+ // Extend the 4-byte match as long as possible.
+ //
+ s += 4
+ t := candidate.offset - e.cur + 4
+ l := e.matchlen(s, t, src)
+ // Try alternative candidate if match length < matchLenGood.
+ if l < matchLenGood-4 && candidateAlt.offset != 0 {
+ t2 := candidateAlt.offset - e.cur + 4
+ l2 := e.matchlen(s, t2, src)
+ if l2 > l {
+ l = l2
+ t = t2
+ }
+ }
+ // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
+ dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
+ dst.n++
+ s += l
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ // We could immediately start working at s now, but to improve
+ // compression we first update the hash table at s-2, s-1 and at s. If
+ // another emitCopy is not our next move, also calculate nextHash
+ // at s+1. At least on GOARCH=amd64, these three hash calculations
+ // are faster as one load64 call (with some shifts) instead of
+ // three load32 calls.
+ x := load6432(src, s-2)
+ prevHash := hash(uint32(x))
+
+ e.table[prevHash&tableMask] = tableEntryPrev{
+ Prev: e.table[prevHash&tableMask].Cur,
+ Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)},
+ }
+ x >>= 8
+ prevHash = hash(uint32(x))
+
+ e.table[prevHash&tableMask] = tableEntryPrev{
+ Prev: e.table[prevHash&tableMask].Cur,
+ Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)},
+ }
+ x >>= 8
+ currHash := hash(uint32(x))
+ candidates := e.table[currHash&tableMask]
+ cv = uint32(x)
+ e.table[currHash&tableMask] = tableEntryPrev{
+ Prev: candidates.Cur,
+ Cur: tableEntry{offset: s + e.cur, val: cv},
+ }
+
+ // Check both candidates
+ candidate = candidates.Cur
+ candidateAlt = tableEntry{}
+ if cv == candidate.val {
+ offset := s - (candidate.offset - e.cur)
+ if offset < maxMatchOffset {
+ offset = s - (candidates.Prev.offset - e.cur)
+ if cv == candidates.Prev.val && offset < maxMatchOffset {
+ candidateAlt = candidates.Prev
+ }
+ continue
+ }
+ } else {
+ // We only check if value mismatches.
+ // Offset will always be invalid in other cases.
+ candidate = candidates.Prev
+ if cv == candidate.val {
+ offset := s - (candidate.offset - e.cur)
+ if offset < maxMatchOffset {
+ continue
+ }
+ }
+ }
+ cv = uint32(x >> 8)
+ nextHash = hash(cv)
+ s++
+ break
+ }
+ }
+
+emitRemainder:
+ if int(nextEmit) < len(src) {
+ emitLiteral(dst, src[nextEmit:])
+ }
+ e.cur += int32(len(src))
+ e.prev = e.prev[:len(src)]
+ copy(e.prev, src)
+}
+
+func (e *snappyGen) matchlen(s, t int32, src []byte) int32 {
+ s1 := int(s) + maxMatchLength - 4
+ if s1 > len(src) {
+ s1 = len(src)
+ }
+
+ // If we are inside the current block
+ if t >= 0 {
+ b := src[t:]
+ a := src[s:s1]
+ b = b[:len(a)]
+ // Extend the match to be as long as possible.
+ for i := range a {
+ if a[i] != b[i] {
+ return int32(i)
+ }
+ }
+ return int32(len(a))
+ }
+
+ // We found a match in the previous block.
+ tp := int32(len(e.prev)) + t
+ if tp < 0 {
+ return 0
+ }
+
+ // Extend the match to be as long as possible.
+ a := src[s:s1]
+ b := e.prev[tp:]
+ if len(b) > len(a) {
+ b = b[:len(a)]
+ }
+ a = a[:len(b)]
+ for i := range b {
+ if a[i] != b[i] {
+ return int32(i)
+ }
+ }
+ n := int32(len(b))
+ a = src[s+n : s1]
+ b = src[:len(a)]
+ for i := range a {
+ if a[i] != b[i] {
+ return int32(i) + n
+ }
+ }
+ return int32(len(a)) + n
+}
+
+// Reset the encoding table.
+func (e *snappyGen) Reset() {
+ e.prev = e.prev[:0]
+ e.cur += maxMatchOffset + 1
+}
diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go
new file mode 100644
index 0000000000..4f275ea61d
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/token.go
@@ -0,0 +1,115 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import "fmt"
+
+const (
+ // 2 bits:  type    0 = literal  1 = EOF  2 = Match  3 = Unused
+ // 8 bits:  xlength = length - MIN_MATCH_LENGTH
+ // 22 bits: xoffset = offset - MIN_OFFSET_SIZE, or literal
+ lengthShift = 22
+ offsetMask = 1<<lengthShift - 1
+ typeMask = 3 << 30
+ literalType = 0 << 30
+ matchType = 1 << 30
+)
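+
+// Editorial note (not part of the upstream file): with this layout a match of
+// length 8 at offset 100 is stored as matchToken(8-3, 100-1); length() and
+// offset() on the resulting token then return the stored extras 5 and 99.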
+
+// The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
+// is lengthCodes[length - MIN_MATCH_LENGTH]
+var lengthCodes = [...]uint32{
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
+ 9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
+ 13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
+ 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+ 17, 17, 17, 17, 17, 17, 17, 17, 18, 18,
+ 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
+ 19, 19, 19, 19, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 28,
+}
+
+var offsetCodes = [...]uint32{
+ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+}
+
+type token uint32
+
+type tokens struct {
+ tokens [maxStoreBlockSize + 1]token
+ n uint16 // Must be able to contain maxStoreBlockSize
+}
+
+// Convert a literal into a literal token.
+func literalToken(literal uint32) token { return token(literalType + literal) }
+
+// Convert a < xlength, xoffset > pair into a match token.
+func matchToken(xlength uint32, xoffset uint32) token {
+ return token(matchType + xlength<<lengthShift + xoffset)
+}
+
+func matchTokend(xlength uint32, xoffset uint32) token {
+ if xlength > maxMatchLength || xoffset > maxMatchOffset {
+ panic(fmt.Sprintf("Invalid match: len: %d, offset: %d\n", xlength, xoffset))
+ return token(matchType)
+ }
+ return token(matchType + xlength<<lengthShift + xoffset)
+}
+
+// Returns the type of a token
+func (t token) typ() uint32 { return uint32(t) & typeMask }
+
+// Returns the literal of a literal token
+func (t token) literal() uint32 { return uint32(t - literalType) }
+
+// Returns the extra offset of a match token
+func (t token) offset() uint32 { return uint32(t) & offsetMask }
+
+func (t token) length() uint32 { return uint32((t - matchType) >> lengthShift) }
+
+func lengthCode(len uint32) uint32 { return lengthCodes[len] }
+
+// Returns the offset code corresponding to a specific offset
+func offsetCode(off uint32) uint32 {
+ if off < uint32(len(offsetCodes)) {
+ return offsetCodes[off]
+ } else if off>>7 < uint32(len(offsetCodes)) {
+ return offsetCodes[off>>7] + 14
+ } else {
+ return offsetCodes[off>>14] + 28
+ }
+}
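+
+// Editorial note (not part of the upstream file): offsetCode takes the extra
+// offset (distance - 1) stored in a token. For a match distance of 301,
+// offsetCode(300) returns offsetCodes[300>>7]+14 = 16, the DEFLATE distance
+// code covering distances 257-384.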
diff --git a/vendor/github.com/klauspost/compress/gzip/gunzip.go b/vendor/github.com/klauspost/compress/gzip/gunzip.go
new file mode 100644
index 0000000000..e73fab3f0f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/gzip/gunzip.go
@@ -0,0 +1,344 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package gzip implements reading and writing of gzip format compressed files,
+// as specified in RFC 1952.
+package gzip
+
+import (
+ "bufio"
+ "encoding/binary"
+ "errors"
+ "io"
+ "time"
+
+ "github.com/klauspost/compress/flate"
+ "github.com/klauspost/crc32"
+)
+
+const (
+ gzipID1 = 0x1f
+ gzipID2 = 0x8b
+ gzipDeflate = 8
+ flagText = 1 << 0
+ flagHdrCrc = 1 << 1
+ flagExtra = 1 << 2
+ flagName = 1 << 3
+ flagComment = 1 << 4
+)
+
+var (
+ // ErrChecksum is returned when reading GZIP data that has an invalid checksum.
+ ErrChecksum = errors.New("gzip: invalid checksum")
+ // ErrHeader is returned when reading GZIP data that has an invalid header.
+ ErrHeader = errors.New("gzip: invalid header")
+)
+
+var le = binary.LittleEndian
+
+// noEOF converts io.EOF to io.ErrUnexpectedEOF.
+func noEOF(err error) error {
+ if err == io.EOF {
+ return io.ErrUnexpectedEOF
+ }
+ return err
+}
+
+// The gzip file stores a header giving metadata about the compressed file.
+// That header is exposed as the fields of the Writer and Reader structs.
+//
+// Strings must be UTF-8 encoded and may only contain Unicode code points
+// U+0001 through U+00FF, due to limitations of the GZIP file format.
+type Header struct {
+ Comment string // comment
+ Extra []byte // "extra data"
+ ModTime time.Time // modification time
+ Name string // file name
+ OS byte // operating system type
+}
+
+// A Reader is an io.Reader that can be read to retrieve
+// uncompressed data from a gzip-format compressed file.
+//
+// In general, a gzip file can be a concatenation of gzip files,
+// each with its own header. Reads from the Reader
+// return the concatenation of the uncompressed data of each.
+// Only the first header is recorded in the Reader fields.
+//
+// Gzip files store a length and checksum of the uncompressed data.
+// The Reader will return a ErrChecksum when Read
+// reaches the end of the uncompressed data if it does not
+// have the expected length or checksum. Clients should treat data
+// returned by Read as tentative until they receive the io.EOF
+// marking the end of the data.
+type Reader struct {
+ Header // valid after NewReader or Reader.Reset
+ r flate.Reader
+ decompressor io.ReadCloser
+ digest uint32 // CRC-32, IEEE polynomial (section 8)
+ size uint32 // Uncompressed size (section 2.3.1)
+ buf [512]byte
+ err error
+ multistream bool
+}
+
+// NewReader creates a new Reader reading the given reader.
+// If r does not also implement io.ByteReader,
+// the decompressor may read more data than necessary from r.
+//
+// It is the caller's responsibility to call Close on the Reader when done.
+//
+// The Reader.Header fields will be valid in the Reader returned.
+func NewReader(r io.Reader) (*Reader, error) {
+ z := new(Reader)
+ if err := z.Reset(r); err != nil {
+ return nil, err
+ }
+ return z, nil
+}
+
+// Reset discards the Reader z's state and makes it equivalent to the
+// result of its original state from NewReader, but reading from r instead.
+// This permits reusing a Reader rather than allocating a new one.
+func (z *Reader) Reset(r io.Reader) error {
+ *z = Reader{
+ decompressor: z.decompressor,
+ multistream: true,
+ }
+ if rr, ok := r.(flate.Reader); ok {
+ z.r = rr
+ } else {
+ z.r = bufio.NewReader(r)
+ }
+ z.Header, z.err = z.readHeader()
+ return z.err
+}
+
+// Multistream controls whether the reader supports multistream files.
+//
+// If enabled (the default), the Reader expects the input to be a sequence
+// of individually gzipped data streams, each with its own header and
+// trailer, ending at EOF. The effect is that the concatenation of a sequence
+// of gzipped files is treated as equivalent to the gzip of the concatenation
+// of the sequence. This is standard behavior for gzip readers.
+//
+// Calling Multistream(false) disables this behavior; disabling the behavior
+// can be useful when reading file formats that distinguish individual gzip
+// data streams or mix gzip data streams with other data streams.
+// In this mode, when the Reader reaches the end of the data stream,
+// Read returns io.EOF. If the underlying reader implements io.ByteReader,
+// it will be left positioned just after the gzip stream.
+// To start the next stream, call z.Reset(r) followed by z.Multistream(false).
+// If there is no next stream, z.Reset(r) will return io.EOF.
+func (z *Reader) Multistream(ok bool) {
+ z.multistream = ok
+}
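+
+// Editorial sketch (not part of the upstream file): reading the members of a
+// multi-member stream one at a time with the Reset/Multistream pattern
+// described above. The helper name and parameters are illustrative only.
+func exampleReadMembers(dst io.Writer, src io.Reader) error {
+ br := bufio.NewReader(src)
+ zr, err := NewReader(br)
+ if err != nil {
+ return err
+ }
+ for {
+ zr.Multistream(false)
+ if _, err := io.Copy(dst, zr); err != nil {
+ return err
+ }
+ err = zr.Reset(br)
+ if err == io.EOF {
+ return nil // no further members
+ }
+ if err != nil {
+ return err
+ }
+ }
+}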
+
+// readString reads a NUL-terminated string from z.r.
+// It treats the bytes read as being encoded as ISO 8859-1 (Latin-1) and
+// will output a string encoded using UTF-8.
+// This method always updates z.digest with the data read.
+func (z *Reader) readString() (string, error) {
+ var err error
+ needConv := false
+ for i := 0; ; i++ {
+ if i >= len(z.buf) {
+ return "", ErrHeader
+ }
+ z.buf[i], err = z.r.ReadByte()
+ if err != nil {
+ return "", err
+ }
+ if z.buf[i] > 0x7f {
+ needConv = true
+ }
+ if z.buf[i] == 0 {
+ // Digest covers the NUL terminator.
+ z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:i+1])
+
+ // Strings are ISO 8859-1, Latin-1 (RFC 1952, section 2.3.1).
+ if needConv {
+ s := make([]rune, 0, i)
+ for _, v := range z.buf[:i] {
+ s = append(s, rune(v))
+ }
+ return string(s), nil
+ }
+ return string(z.buf[:i]), nil
+ }
+ }
+}
+
+// readHeader reads the GZIP header according to section 2.3.1.
+// This method does not set z.err.
+func (z *Reader) readHeader() (hdr Header, err error) {
+ if _, err = io.ReadFull(z.r, z.buf[:10]); err != nil {
+ // RFC 1952, section 2.2, says the following:
+ // A gzip file consists of a series of "members" (compressed data sets).
+ //
+ // Other than this, the specification does not clarify whether a
+ // "series" is defined as "one or more" or "zero or more". To err on the
+ // side of caution, Go interprets this to mean "zero or more".
+ // Thus, it is okay to return io.EOF here.
+ return hdr, err
+ }
+ if z.buf[0] != gzipID1 || z.buf[1] != gzipID2 || z.buf[2] != gzipDeflate {
+ return hdr, ErrHeader
+ }
+ flg := z.buf[3]
+ hdr.ModTime = time.Unix(int64(le.Uint32(z.buf[4:8])), 0)
+ // z.buf[8] is XFL and is currently ignored.
+ hdr.OS = z.buf[9]
+ z.digest = crc32.ChecksumIEEE(z.buf[:10])
+
+ if flg&flagExtra != 0 {
+ if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil {
+ return hdr, noEOF(err)
+ }
+ z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:2])
+ data := make([]byte, le.Uint16(z.buf[:2]))
+ if _, err = io.ReadFull(z.r, data); err != nil {
+ return hdr, noEOF(err)
+ }
+ z.digest = crc32.Update(z.digest, crc32.IEEETable, data)
+ hdr.Extra = data
+ }
+
+ var s string
+ if flg&flagName != 0 {
+ if s, err = z.readString(); err != nil {
+ return hdr, err
+ }
+ hdr.Name = s
+ }
+
+ if flg&flagComment != 0 {
+ if s, err = z.readString(); err != nil {
+ return hdr, err
+ }
+ hdr.Comment = s
+ }
+
+ if flg&flagHdrCrc != 0 {
+ if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil {
+ return hdr, noEOF(err)
+ }
+ digest := le.Uint16(z.buf[:2])
+ if digest != uint16(z.digest) {
+ return hdr, ErrHeader
+ }
+ }
+
+ z.digest = 0
+ if z.decompressor == nil {
+ z.decompressor = flate.NewReader(z.r)
+ } else {
+ z.decompressor.(flate.Resetter).Reset(z.r, nil)
+ }
+ return hdr, nil
+}
+
+// Read implements io.Reader, reading uncompressed bytes from its underlying Reader.
+func (z *Reader) Read(p []byte) (n int, err error) {
+ if z.err != nil {
+ return 0, z.err
+ }
+
+ n, z.err = z.decompressor.Read(p)
+ z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n])
+ z.size += uint32(n)
+ if z.err != io.EOF {
+ // In the normal case we return here.
+ return n, z.err
+ }
+
+ // Finished file; check checksum and size.
+ if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil {
+ z.err = noEOF(err)
+ return n, z.err
+ }
+ digest := le.Uint32(z.buf[:4])
+ size := le.Uint32(z.buf[4:8])
+ if digest != z.digest || size != z.size {
+ z.err = ErrChecksum
+ return n, z.err
+ }
+ z.digest, z.size = 0, 0
+
+ // File is ok; check if there is another.
+ if !z.multistream {
+ return n, io.EOF
+ }
+ z.err = nil // Remove io.EOF
+
+ if _, z.err = z.readHeader(); z.err != nil {
+ return n, z.err
+ }
+
+ // Read from next file, if necessary.
+ if n > 0 {
+ return n, nil
+ }
+ return z.Read(p)
+}
+
+// WriteTo implements io.WriterTo, supporting io.Copy and friends.
+func (z *Reader) WriteTo(w io.Writer) (int64, error) {
+ total := int64(0)
+ crcWriter := crc32.NewIEEE()
+ for {
+ if z.err != nil {
+ if z.err == io.EOF {
+ return total, nil
+ }
+ return total, z.err
+ }
+
+ // We write both to output and digest.
+ mw := io.MultiWriter(w, crcWriter)
+ n, err := z.decompressor.(io.WriterTo).WriteTo(mw)
+ total += n
+ z.size += uint32(n)
+ if err != nil {
+ z.err = err
+ return total, z.err
+ }
+
+ // Finished file; check checksum + size.
+ if _, err := io.ReadFull(z.r, z.buf[0:8]); err != nil {
+ if err == io.EOF {
+ err = io.ErrUnexpectedEOF
+ }
+ z.err = err
+ return total, err
+ }
+ z.digest = crcWriter.Sum32()
+ digest := le.Uint32(z.buf[:4])
+ size := le.Uint32(z.buf[4:8])
+ if digest != z.digest || size != z.size {
+ z.err = ErrChecksum
+ return total, z.err
+ }
+ z.digest, z.size = 0, 0
+
+ // File is ok; check if there is another.
+ if !z.multistream {
+ return total, nil
+ }
+ crcWriter.Reset()
+ z.err = nil // Remove io.EOF
+
+ if _, z.err = z.readHeader(); z.err != nil {
+ if z.err == io.EOF {
+ return total, nil
+ }
+ return total, z.err
+ }
+ }
+}
+
+// Close closes the Reader. It does not close the underlying io.Reader.
+// In order for the GZIP checksum to be verified, the reader must be
+// fully consumed until the io.EOF.
+func (z *Reader) Close() error { return z.decompressor.Close() }
diff --git a/vendor/github.com/klauspost/compress/gzip/gzip.go b/vendor/github.com/klauspost/compress/gzip/gzip.go
new file mode 100644
index 0000000000..a0f3ed0fcf
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/gzip/gzip.go
@@ -0,0 +1,251 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gzip
+
+import (
+ "errors"
+ "fmt"
+ "io"
+
+ "github.com/klauspost/compress/flate"
+ "github.com/klauspost/crc32"
+)
+
+// These constants are copied from the flate package, so that code that imports
+// "compress/gzip" does not also have to import "compress/flate".
+const (
+ NoCompression = flate.NoCompression
+ BestSpeed = flate.BestSpeed
+ BestCompression = flate.BestCompression
+ DefaultCompression = flate.DefaultCompression
+ ConstantCompression = flate.ConstantCompression
+ HuffmanOnly = flate.HuffmanOnly
+)
+
+// A Writer is an io.WriteCloser.
+// Writes to a Writer are compressed and written to w.
+type Writer struct {
+ Header // written at first call to Write, Flush, or Close
+ w io.Writer
+ level int
+ wroteHeader bool
+ compressor *flate.Writer
+ digest uint32 // CRC-32, IEEE polynomial (section 8)
+ size uint32 // Uncompressed size (section 2.3.1)
+ closed bool
+ buf [10]byte
+ err error
+}
+
+// NewWriter returns a new Writer.
+// Writes to the returned writer are compressed and written to w.
+//
+// It is the caller's responsibility to call Close on the WriteCloser when done.
+// Writes may be buffered and not flushed until Close.
+//
+// Callers that wish to set the fields in Writer.Header must do so before
+// the first call to Write, Flush, or Close.
+func NewWriter(w io.Writer) *Writer {
+ z, _ := NewWriterLevel(w, DefaultCompression)
+ return z
+}
+
+// NewWriterLevel is like NewWriter but specifies the compression level instead
+// of assuming DefaultCompression.
+//
+// The compression level can be DefaultCompression, NoCompression, HuffmanOnly
+// (ConstantCompression) or any integer value between BestSpeed and
+// BestCompression inclusive. The error returned will be nil if the level is valid.
+func NewWriterLevel(w io.Writer, level int) (*Writer, error) {
+ if level < HuffmanOnly || level > BestCompression {
+ return nil, fmt.Errorf("gzip: invalid compression level: %d", level)
+ }
+ z := new(Writer)
+ z.init(w, level)
+ return z, nil
+}
+
+func (z *Writer) init(w io.Writer, level int) {
+ compressor := z.compressor
+ if compressor != nil {
+ compressor.Reset(w)
+ }
+ *z = Writer{
+ Header: Header{
+ OS: 255, // unknown
+ },
+ w: w,
+ level: level,
+ compressor: compressor,
+ }
+}
+
+// Reset discards the Writer z's state and makes it equivalent to the
+// result of its original state from NewWriter or NewWriterLevel, but
+// writing to w instead. This permits reusing a Writer rather than
+// allocating a new one.
+func (z *Writer) Reset(w io.Writer) {
+ z.init(w, z.level)
+}
+
+// writeBytes writes a length-prefixed byte slice to z.w.
+func (z *Writer) writeBytes(b []byte) error {
+ if len(b) > 0xffff {
+ return errors.New("gzip.Write: Extra data is too large")
+ }
+ le.PutUint16(z.buf[:2], uint16(len(b)))
+ _, err := z.w.Write(z.buf[:2])
+ if err != nil {
+ return err
+ }
+ _, err = z.w.Write(b)
+ return err
+}
+
+// writeString writes a UTF-8 string s in GZIP's format to z.w.
+// GZIP (RFC 1952) specifies that strings are NUL-terminated ISO 8859-1 (Latin-1).
+func (z *Writer) writeString(s string) (err error) {
+ // GZIP stores Latin-1 strings; error if non-Latin-1; convert if non-ASCII.
+ needconv := false
+ for _, v := range s {
+ if v == 0 || v > 0xff {
+ return errors.New("gzip.Write: non-Latin-1 header string")
+ }
+ if v > 0x7f {
+ needconv = true
+ }
+ }
+ if needconv {
+ b := make([]byte, 0, len(s))
+ for _, v := range s {
+ b = append(b, byte(v))
+ }
+ _, err = z.w.Write(b)
+ } else {
+ _, err = io.WriteString(z.w, s)
+ }
+ if err != nil {
+ return err
+ }
+ // GZIP strings are NUL-terminated.
+ z.buf[0] = 0
+ _, err = z.w.Write(z.buf[:1])
+ return err
+}
+
+// Write writes a compressed form of p to the underlying io.Writer. The
+// compressed bytes are not necessarily flushed until the Writer is closed.
+func (z *Writer) Write(p []byte) (int, error) {
+ if z.err != nil {
+ return 0, z.err
+ }
+ var n int
+ // Write the GZIP header lazily.
+ if !z.wroteHeader {
+ z.wroteHeader = true
+ z.buf[0] = gzipID1
+ z.buf[1] = gzipID2
+ z.buf[2] = gzipDeflate
+ z.buf[3] = 0
+ if z.Extra != nil {
+ z.buf[3] |= 0x04
+ }
+ if z.Name != "" {
+ z.buf[3] |= 0x08
+ }
+ if z.Comment != "" {
+ z.buf[3] |= 0x10
+ }
+ le.PutUint32(z.buf[4:8], uint32(z.ModTime.Unix()))
+ if z.level == BestCompression {
+ z.buf[8] = 2
+ } else if z.level == BestSpeed {
+ z.buf[8] = 4
+ } else {
+ z.buf[8] = 0
+ }
+ z.buf[9] = z.OS
+ n, z.err = z.w.Write(z.buf[:10])
+ if z.err != nil {
+ return n, z.err
+ }
+ if z.Extra != nil {
+ z.err = z.writeBytes(z.Extra)
+ if z.err != nil {
+ return n, z.err
+ }
+ }
+ if z.Name != "" {
+ z.err = z.writeString(z.Name)
+ if z.err != nil {
+ return n, z.err
+ }
+ }
+ if z.Comment != "" {
+ z.err = z.writeString(z.Comment)
+ if z.err != nil {
+ return n, z.err
+ }
+ }
+ if z.compressor == nil {
+ z.compressor, _ = flate.NewWriter(z.w, z.level)
+ }
+ }
+ z.size += uint32(len(p))
+ z.digest = crc32.Update(z.digest, crc32.IEEETable, p)
+ n, z.err = z.compressor.Write(p)
+ return n, z.err
+}
+
+// Flush flushes any pending compressed data to the underlying writer.
+//
+// It is useful mainly in compressed network protocols, to ensure that
+// a remote reader has enough data to reconstruct a packet. Flush does
+// not return until the data has been written. If the underlying
+// writer returns an error, Flush returns that error.
+//
+// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH.
+func (z *Writer) Flush() error {
+ if z.err != nil {
+ return z.err
+ }
+ if z.closed {
+ return nil
+ }
+ if !z.wroteHeader {
+ z.Write(nil)
+ if z.err != nil {
+ return z.err
+ }
+ }
+ z.err = z.compressor.Flush()
+ return z.err
+}
+
+// Close closes the Writer, flushing any unwritten data to the underlying
+// io.Writer, but does not close the underlying io.Writer.
+func (z *Writer) Close() error {
+ if z.err != nil {
+ return z.err
+ }
+ if z.closed {
+ return nil
+ }
+ z.closed = true
+ if !z.wroteHeader {
+ z.Write(nil)
+ if z.err != nil {
+ return z.err
+ }
+ }
+ z.err = z.compressor.Close()
+ if z.err != nil {
+ return z.err
+ }
+ le.PutUint32(z.buf[:4], z.digest)
+ le.PutUint32(z.buf[4:8], z.size)
+ _, z.err = z.w.Write(z.buf[:8])
+ return z.err
+}
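+
+// Editorial sketch (not part of the upstream file): a minimal compression
+// helper for the Writer. Close must be called to flush the deflate stream and
+// write the gzip trailer (CRC-32 and size). Names are illustrative only.
+func exampleGzipTo(dst io.Writer, src io.Reader) error {
+ zw, err := NewWriterLevel(dst, BestSpeed)
+ if err != nil {
+ return err
+ }
+ if _, err := io.Copy(zw, src); err != nil {
+ zw.Close()
+ return err
+ }
+ return zw.Close()
+}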
diff --git a/vendor/github.com/klauspost/cpuid/LICENSE b/vendor/github.com/klauspost/cpuid/LICENSE
new file mode 100644
index 0000000000..5cec7ee949
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Klaus Post
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/vendor/github.com/klauspost/cpuid/README.md b/vendor/github.com/klauspost/cpuid/README.md
new file mode 100644
index 0000000000..b2b6bee879
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/README.md
@@ -0,0 +1,145 @@
+# cpuid
+Package cpuid provides information about the CPU running the current program.
+
+CPU features are detected on startup, and kept for fast access through the life of the application.
+Currently x86 / x64 (AMD64) is supported, and no external C (cgo) code is used, which should make the library very easy to use.
+
+You can access the CPU information through the shared CPU variable of the cpuid library.
+
+Package home: https://github.com/klauspost/cpuid
+
+[![GoDoc][1]][2] [![Build Status][3]][4]
+
+[1]: https://godoc.org/github.com/klauspost/cpuid?status.svg
+[2]: https://godoc.org/github.com/klauspost/cpuid
+[3]: https://travis-ci.org/klauspost/cpuid.svg
+[4]: https://travis-ci.org/klauspost/cpuid
+
+# features
+## CPU Instructions
+* **CMOV** (i686 CMOV)
+* **NX** (NX (No-Execute) bit)
+* **AMD3DNOW** (AMD 3DNOW)
+* **AMD3DNOWEXT** (AMD 3DNowExt)
+* **MMX** (standard MMX)
+* **MMXEXT** (SSE integer functions or AMD MMX ext)
+* **SSE** (SSE functions)
+* **SSE2** (P4 SSE functions)
+* **SSE3** (Prescott SSE3 functions)
+* **SSSE3** (Conroe SSSE3 functions)
+* **SSE4** (Penryn SSE4.1 functions)
+* **SSE4A** (AMD Barcelona microarchitecture SSE4a instructions)
+* **SSE42** (Nehalem SSE4.2 functions)
+* **AVX** (AVX functions)
+* **AVX2** (AVX2 functions)
+* **FMA3** (Intel FMA 3)
+* **FMA4** (Bulldozer FMA4 functions)
+* **XOP** (Bulldozer XOP functions)
+* **F16C** (Half-precision floating-point conversion)
+* **BMI1** (Bit Manipulation Instruction Set 1)
+* **BMI2** (Bit Manipulation Instruction Set 2)
+* **TBM** (AMD Trailing Bit Manipulation)
+* **LZCNT** (LZCNT instruction)
+* **POPCNT** (POPCNT instruction)
+* **AESNI** (Advanced Encryption Standard New Instructions)
+* **CLMUL** (Carry-less Multiplication)
+* **HTT** (Hyperthreading (enabled))
+* **HLE** (Hardware Lock Elision)
+* **RTM** (Restricted Transactional Memory)
+* **RDRAND** (RDRAND instruction is available)
+* **RDSEED** (RDSEED instruction is available)
+* **ADX** (Intel ADX (Multi-Precision Add-Carry Instruction Extensions))
+* **SHA** (Intel SHA Extensions)
+* **AVX512F** (AVX-512 Foundation)
+* **AVX512DQ** (AVX-512 Doubleword and Quadword Instructions)
+* **AVX512IFMA** (AVX-512 Integer Fused Multiply-Add Instructions)
+* **AVX512PF** (AVX-512 Prefetch Instructions)
+* **AVX512ER** (AVX-512 Exponential and Reciprocal Instructions)
+* **AVX512CD** (AVX-512 Conflict Detection Instructions)
+* **AVX512BW** (AVX-512 Byte and Word Instructions)
+* **AVX512VL** (AVX-512 Vector Length Extensions)
+* **AVX512VBMI** (AVX-512 Vector Bit Manipulation Instructions)
+* **MPX** (Intel MPX (Memory Protection Extensions))
+* **ERMS** (Enhanced REP MOVSB/STOSB)
+* **RDTSCP** (RDTSCP Instruction)
+* **CX16** (CMPXCHG16B Instruction)
+* **SGX** (Software Guard Extensions, with activation details)
+
+## Performance
+* **RDTSCP()** Returns current cycle count. Can be used for benchmarking.
+* **SSE2SLOW** (SSE2 is supported, but usually not faster)
+* **SSE3SLOW** (SSE3 is supported, but usually not faster)
+* **ATOM** (Atom processor, some SSSE3 instructions are slower)
+* **Cache line** (Probable size of a cache line).
+* **L1, L2, L3 Cache size** on newer Intel/AMD CPUs.
+
+## CPU Vendor/VM
+* **Intel**
+* **AMD**
+* **VIA**
+* **Transmeta**
+* **NSC**
+* **KVM** (Kernel-based Virtual Machine)
+* **MSVM** (Microsoft Hyper-V or Windows Virtual PC)
+* **VMware**
+* **XenHVM**
+
+# installing
+
+```go get github.com/klauspost/cpuid```
+
+# example
+
+```Go
+package main
+
+import (
+ "fmt"
+ "github.com/klauspost/cpuid"
+)
+
+func main() {
+ // Print basic CPU information:
+ fmt.Println("Name:", cpuid.CPU.BrandName)
+ fmt.Println("PhysicalCores:", cpuid.CPU.PhysicalCores)
+ fmt.Println("ThreadsPerCore:", cpuid.CPU.ThreadsPerCore)
+ fmt.Println("LogicalCores:", cpuid.CPU.LogicalCores)
+ fmt.Println("Family", cpuid.CPU.Family, "Model:", cpuid.CPU.Model)
+ fmt.Println("Features:", cpuid.CPU.Features)
+ fmt.Println("Cacheline bytes:", cpuid.CPU.CacheLine)
+ fmt.Println("L1 Data Cache:", cpuid.CPU.Cache.L1D, "bytes")
+ fmt.Println("L1 Instruction Cache:", cpuid.CPU.Cache.L1I, "bytes")
+ fmt.Println("L2 Cache:", cpuid.CPU.Cache.L2, "bytes")
+ fmt.Println("L3 Cache:", cpuid.CPU.Cache.L3, "bytes")
+
+ // Test if we have a specific feature:
+ if cpuid.CPU.SSE() {
+ fmt.Println("We have Streaming SIMD Extensions")
+ }
+}
+```
+
+Sample output:
+```
+>go run main.go
+Name: Intel(R) Core(TM) i5-2540M CPU @ 2.60GHz
+PhysicalCores: 2
+ThreadsPerCore: 2
+LogicalCores: 4
+Family 6 Model: 42
+Features: CMOV,MMX,MMXEXT,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,AVX,AESNI,CLMUL
+Cacheline bytes: 64
+We have Streaming SIMD Extensions
+```
+
+# private package
+
+In the "private" folder you can find an autogenerated version of the library that you can include in your own packages.
+
+For this purpose all exports are removed, and functions and constants are lowercased.
+
+This is not a recommended way of using the library, but provided for convenience, if it is difficult for you to use external packages.
+
+# license
+
+This code is published under an MIT license. See LICENSE file for more information.
diff --git a/vendor/github.com/klauspost/cpuid/cpuid.go b/vendor/github.com/klauspost/cpuid/cpuid.go
new file mode 100644
index 0000000000..9230ca5628
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/cpuid.go
@@ -0,0 +1,1022 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// Package cpuid provides information about the CPU running the current program.
+//
+// CPU features are detected on startup, and kept for fast access through the life of the application.
+// Currently x86 / x64 (AMD64) is supported.
+//
+// You can access the CPU information by accessing the shared CPU variable of the cpuid library.
+//
+// Package home: https://github.com/klauspost/cpuid
+package cpuid
+
+import "strings"
+
+// Vendor is a representation of a CPU vendor.
+type Vendor int
+
+const (
+ Other Vendor = iota
+ Intel
+ AMD
+ VIA
+ Transmeta
+ NSC
+ KVM // Kernel-based Virtual Machine
+ MSVM // Microsoft Hyper-V or Windows Virtual PC
+ VMware
+ XenHVM
+)
+
+const (
+ CMOV = 1 << iota // i686 CMOV
+ NX // NX (No-Execute) bit
+ AMD3DNOW // AMD 3DNOW
+ AMD3DNOWEXT // AMD 3DNowExt
+ MMX // standard MMX
+ MMXEXT // SSE integer functions or AMD MMX ext
+ SSE // SSE functions
+ SSE2 // P4 SSE functions
+ SSE3 // Prescott SSE3 functions
+ SSSE3 // Conroe SSSE3 functions
+ SSE4 // Penryn SSE4.1 functions
+ SSE4A // AMD Barcelona microarchitecture SSE4a instructions
+ SSE42 // Nehalem SSE4.2 functions
+ AVX // AVX functions
+ AVX2 // AVX2 functions
+ FMA3 // Intel FMA 3
+ FMA4 // Bulldozer FMA4 functions
+ XOP // Bulldozer XOP functions
+ F16C // Half-precision floating-point conversion
+ BMI1 // Bit Manipulation Instruction Set 1
+ BMI2 // Bit Manipulation Instruction Set 2
+ TBM // AMD Trailing Bit Manipulation
+ LZCNT // LZCNT instruction
+ POPCNT // POPCNT instruction
+ AESNI // Advanced Encryption Standard New Instructions
+ CLMUL // Carry-less Multiplication
+ HTT // Hyperthreading (enabled)
+ HLE // Hardware Lock Elision
+ RTM // Restricted Transactional Memory
+ RDRAND // RDRAND instruction is available
+ RDSEED // RDSEED instruction is available
+ ADX // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
+ SHA // Intel SHA Extensions
+ AVX512F // AVX-512 Foundation
+ AVX512DQ // AVX-512 Doubleword and Quadword Instructions
+ AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions
+ AVX512PF // AVX-512 Prefetch Instructions
+ AVX512ER // AVX-512 Exponential and Reciprocal Instructions
+ AVX512CD // AVX-512 Conflict Detection Instructions
+ AVX512BW // AVX-512 Byte and Word Instructions
+ AVX512VL // AVX-512 Vector Length Extensions
+ AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions
+ MPX // Intel MPX (Memory Protection Extensions)
+ ERMS // Enhanced REP MOVSB/STOSB
+ RDTSCP // RDTSCP Instruction
+ CX16 // CMPXCHG16B Instruction
+ SGX // Software Guard Extensions
+
+ // Performance indicators
+ SSE2SLOW // SSE2 is supported, but usually not faster
+ SSE3SLOW // SSE3 is supported, but usually not faster
+ ATOM // Atom processor, some SSSE3 instructions are slower
+)
+
+var flagNames = map[Flags]string{
+ CMOV: "CMOV", // i686 CMOV
+ NX: "NX", // NX (No-Execute) bit
+ AMD3DNOW: "AMD3DNOW", // AMD 3DNOW
+ AMD3DNOWEXT: "AMD3DNOWEXT", // AMD 3DNowExt
+ MMX: "MMX", // Standard MMX
+ MMXEXT: "MMXEXT", // SSE integer functions or AMD MMX ext
+ SSE: "SSE", // SSE functions
+ SSE2: "SSE2", // P4 SSE2 functions
+ SSE3: "SSE3", // Prescott SSE3 functions
+ SSSE3: "SSSE3", // Conroe SSSE3 functions
+ SSE4: "SSE4.1", // Penryn SSE4.1 functions
+ SSE4A: "SSE4A", // AMD Barcelona microarchitecture SSE4a instructions
+ SSE42: "SSE4.2", // Nehalem SSE4.2 functions
+ AVX: "AVX", // AVX functions
+	AVX2:        "AVX2",        // AVX2 functions
+ FMA3: "FMA3", // Intel FMA 3
+ FMA4: "FMA4", // Bulldozer FMA4 functions
+ XOP: "XOP", // Bulldozer XOP functions
+ F16C: "F16C", // Half-precision floating-point conversion
+ BMI1: "BMI1", // Bit Manipulation Instruction Set 1
+ BMI2: "BMI2", // Bit Manipulation Instruction Set 2
+ TBM: "TBM", // AMD Trailing Bit Manipulation
+ LZCNT: "LZCNT", // LZCNT instruction
+ POPCNT: "POPCNT", // POPCNT instruction
+ AESNI: "AESNI", // Advanced Encryption Standard New Instructions
+ CLMUL: "CLMUL", // Carry-less Multiplication
+ HTT: "HTT", // Hyperthreading (enabled)
+ HLE: "HLE", // Hardware Lock Elision
+ RTM: "RTM", // Restricted Transactional Memory
+ RDRAND: "RDRAND", // RDRAND instruction is available
+ RDSEED: "RDSEED", // RDSEED instruction is available
+ ADX: "ADX", // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
+ SHA: "SHA", // Intel SHA Extensions
+ AVX512F: "AVX512F", // AVX-512 Foundation
+ AVX512DQ: "AVX512DQ", // AVX-512 Doubleword and Quadword Instructions
+ AVX512IFMA: "AVX512IFMA", // AVX-512 Integer Fused Multiply-Add Instructions
+ AVX512PF: "AVX512PF", // AVX-512 Prefetch Instructions
+ AVX512ER: "AVX512ER", // AVX-512 Exponential and Reciprocal Instructions
+ AVX512CD: "AVX512CD", // AVX-512 Conflict Detection Instructions
+ AVX512BW: "AVX512BW", // AVX-512 Byte and Word Instructions
+ AVX512VL: "AVX512VL", // AVX-512 Vector Length Extensions
+ AVX512VBMI: "AVX512VBMI", // AVX-512 Vector Bit Manipulation Instructions
+ MPX: "MPX", // Intel MPX (Memory Protection Extensions)
+ ERMS: "ERMS", // Enhanced REP MOVSB/STOSB
+ RDTSCP: "RDTSCP", // RDTSCP Instruction
+ CX16: "CX16", // CMPXCHG16B Instruction
+ SGX: "SGX", // Software Guard Extensions
+
+ // Performance indicators
+ SSE2SLOW: "SSE2SLOW", // SSE2 supported, but usually not faster
+ SSE3SLOW: "SSE3SLOW", // SSE3 supported, but usually not faster
+ ATOM: "ATOM", // Atom processor, some SSSE3 instructions are slower
+
+}
+
+// CPUInfo contains information about the detected system CPU.
+type CPUInfo struct {
+ BrandName string // Brand name reported by the CPU
+ VendorID Vendor // Comparable CPU vendor ID
+ Features Flags // Features of the CPU
+ PhysicalCores int // Number of physical processor cores in your CPU. Will be 0 if undetectable.
+ ThreadsPerCore int // Number of threads per physical core. Will be 1 if undetectable.
+ LogicalCores int // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable.
+ Family int // CPU family number
+ Model int // CPU model number
+ CacheLine int // Cache line size in bytes. Will be 0 if undetectable.
+ Cache struct {
+ L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected
+ L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected
+ L2 int // L2 Cache (per core or shared). Will be -1 if undetected
+		L3  int // L3 Cache (per core or shared). Will be -1 if undetected
+ }
+ SGX SGXSupport
+ maxFunc uint32
+ maxExFunc uint32
+}
+
+var cpuid func(op uint32) (eax, ebx, ecx, edx uint32)
+var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+var xgetbv func(index uint32) (eax, edx uint32)
+var rdtscpAsm func() (eax, ebx, ecx, edx uint32)
+
+// CPU contains information about the CPU as detected on startup,
+// or when Detect last was called.
+//
+// Use this as the primary entry point to your data; detection runs once at
+// startup, so queries through this variable are cheap.
+var CPU CPUInfo
+
+func init() {
+ initCPU()
+ Detect()
+}
+
+// Detect will re-detect current CPU info.
+// This will replace the content of the exported CPU variable.
+//
+// Unless you expect the CPU to change while you are running your program
+// you should not need to call this function.
+// If you call this, you must ensure that no other goroutine is accessing the
+// exported CPU variable.
+func Detect() {
+ CPU.maxFunc = maxFunctionID()
+ CPU.maxExFunc = maxExtendedFunction()
+ CPU.BrandName = brandName()
+ CPU.CacheLine = cacheLine()
+ CPU.Family, CPU.Model = familyModel()
+ CPU.Features = support()
+ CPU.SGX = sgx(CPU.Features&SGX != 0)
+ CPU.ThreadsPerCore = threadsPerCore()
+ CPU.LogicalCores = logicalCores()
+ CPU.PhysicalCores = physicalCores()
+ CPU.VendorID = vendorID()
+ CPU.cacheSize()
+}
+
+// Generated here: http://play.golang.org/p/BxFH2Gdc0G
+
+// Cmov indicates support of CMOV instructions
+func (c CPUInfo) Cmov() bool {
+ return c.Features&CMOV != 0
+}
+
+// Amd3dnow indicates support of AMD 3DNOW! instructions
+func (c CPUInfo) Amd3dnow() bool {
+ return c.Features&AMD3DNOW != 0
+}
+
+// Amd3dnowExt indicates support of AMD 3DNOW! Extended instructions
+func (c CPUInfo) Amd3dnowExt() bool {
+ return c.Features&AMD3DNOWEXT != 0
+}
+
+// MMX indicates support of MMX instructions
+func (c CPUInfo) MMX() bool {
+ return c.Features&MMX != 0
+}
+
+// MMXExt indicates support of MMXEXT instructions
+// (SSE integer functions or AMD MMX ext)
+func (c CPUInfo) MMXExt() bool {
+ return c.Features&MMXEXT != 0
+}
+
+// SSE indicates support of SSE instructions
+func (c CPUInfo) SSE() bool {
+ return c.Features&SSE != 0
+}
+
+// SSE2 indicates support of SSE 2 instructions
+func (c CPUInfo) SSE2() bool {
+ return c.Features&SSE2 != 0
+}
+
+// SSE3 indicates support of SSE 3 instructions
+func (c CPUInfo) SSE3() bool {
+ return c.Features&SSE3 != 0
+}
+
+// SSSE3 indicates support of SSSE 3 instructions
+func (c CPUInfo) SSSE3() bool {
+ return c.Features&SSSE3 != 0
+}
+
+// SSE4 indicates support of SSE 4 (also called SSE 4.1) instructions
+func (c CPUInfo) SSE4() bool {
+ return c.Features&SSE4 != 0
+}
+
+// SSE42 indicates support of SSE4.2 instructions
+func (c CPUInfo) SSE42() bool {
+ return c.Features&SSE42 != 0
+}
+
+// AVX indicates support of AVX instructions
+// and operating system support of AVX instructions
+func (c CPUInfo) AVX() bool {
+ return c.Features&AVX != 0
+}
+
+// AVX2 indicates support of AVX2 instructions
+func (c CPUInfo) AVX2() bool {
+ return c.Features&AVX2 != 0
+}
+
+// FMA3 indicates support of FMA3 instructions
+func (c CPUInfo) FMA3() bool {
+ return c.Features&FMA3 != 0
+}
+
+// FMA4 indicates support of FMA4 instructions
+func (c CPUInfo) FMA4() bool {
+ return c.Features&FMA4 != 0
+}
+
+// XOP indicates support of XOP instructions
+func (c CPUInfo) XOP() bool {
+ return c.Features&XOP != 0
+}
+
+// F16C indicates support of F16C instructions
+func (c CPUInfo) F16C() bool {
+ return c.Features&F16C != 0
+}
+
+// BMI1 indicates support of BMI1 instructions
+func (c CPUInfo) BMI1() bool {
+ return c.Features&BMI1 != 0
+}
+
+// BMI2 indicates support of BMI2 instructions
+func (c CPUInfo) BMI2() bool {
+ return c.Features&BMI2 != 0
+}
+
+// TBM indicates support of TBM instructions
+// (AMD Trailing Bit Manipulation)
+func (c CPUInfo) TBM() bool {
+ return c.Features&TBM != 0
+}
+
+// Lzcnt indicates support of LZCNT instruction
+func (c CPUInfo) Lzcnt() bool {
+ return c.Features&LZCNT != 0
+}
+
+// Popcnt indicates support of POPCNT instruction
+func (c CPUInfo) Popcnt() bool {
+ return c.Features&POPCNT != 0
+}
+
+// HTT indicates the processor has Hyperthreading enabled
+func (c CPUInfo) HTT() bool {
+ return c.Features&HTT != 0
+}
+
+// SSE2Slow indicates that SSE2 may be slow on this processor
+func (c CPUInfo) SSE2Slow() bool {
+ return c.Features&SSE2SLOW != 0
+}
+
+// SSE3Slow indicates that SSE3 may be slow on this processor
+func (c CPUInfo) SSE3Slow() bool {
+ return c.Features&SSE3SLOW != 0
+}
+
+// AesNi indicates support of AES-NI instructions
+// (Advanced Encryption Standard New Instructions)
+func (c CPUInfo) AesNi() bool {
+ return c.Features&AESNI != 0
+}
+
+// Clmul indicates support of CLMUL instructions
+// (Carry-less Multiplication)
+func (c CPUInfo) Clmul() bool {
+ return c.Features&CLMUL != 0
+}
+
+// NX indicates support of NX (No-Execute) bit
+func (c CPUInfo) NX() bool {
+ return c.Features&NX != 0
+}
+
+// SSE4A indicates support of AMD Barcelona microarchitecture SSE4a instructions
+func (c CPUInfo) SSE4A() bool {
+ return c.Features&SSE4A != 0
+}
+
+// HLE indicates support of Hardware Lock Elision
+func (c CPUInfo) HLE() bool {
+ return c.Features&HLE != 0
+}
+
+// RTM indicates support of Restricted Transactional Memory
+func (c CPUInfo) RTM() bool {
+ return c.Features&RTM != 0
+}
+
+// Rdrand indicates support of RDRAND instruction is available
+func (c CPUInfo) Rdrand() bool {
+ return c.Features&RDRAND != 0
+}
+
+// Rdseed indicates support of RDSEED instruction is available
+func (c CPUInfo) Rdseed() bool {
+ return c.Features&RDSEED != 0
+}
+
+// ADX indicates support of Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
+func (c CPUInfo) ADX() bool {
+ return c.Features&ADX != 0
+}
+
+// SHA indicates support of Intel SHA Extensions
+func (c CPUInfo) SHA() bool {
+ return c.Features&SHA != 0
+}
+
+// AVX512F indicates support of AVX-512 Foundation
+func (c CPUInfo) AVX512F() bool {
+ return c.Features&AVX512F != 0
+}
+
+// AVX512DQ indicates support of AVX-512 Doubleword and Quadword Instructions
+func (c CPUInfo) AVX512DQ() bool {
+ return c.Features&AVX512DQ != 0
+}
+
+// AVX512IFMA indicates support of AVX-512 Integer Fused Multiply-Add Instructions
+func (c CPUInfo) AVX512IFMA() bool {
+ return c.Features&AVX512IFMA != 0
+}
+
+// AVX512PF indicates support of AVX-512 Prefetch Instructions
+func (c CPUInfo) AVX512PF() bool {
+ return c.Features&AVX512PF != 0
+}
+
+// AVX512ER indicates support of AVX-512 Exponential and Reciprocal Instructions
+func (c CPUInfo) AVX512ER() bool {
+ return c.Features&AVX512ER != 0
+}
+
+// AVX512CD indicates support of AVX-512 Conflict Detection Instructions
+func (c CPUInfo) AVX512CD() bool {
+ return c.Features&AVX512CD != 0
+}
+
+// AVX512BW indicates support of AVX-512 Byte and Word Instructions
+func (c CPUInfo) AVX512BW() bool {
+ return c.Features&AVX512BW != 0
+}
+
+// AVX512VL indicates support of AVX-512 Vector Length Extensions
+func (c CPUInfo) AVX512VL() bool {
+ return c.Features&AVX512VL != 0
+}
+
+// AVX512VBMI indicates support of AVX-512 Vector Bit Manipulation Instructions
+func (c CPUInfo) AVX512VBMI() bool {
+ return c.Features&AVX512VBMI != 0
+}
+
+// MPX indicates support of Intel MPX (Memory Protection Extensions)
+func (c CPUInfo) MPX() bool {
+ return c.Features&MPX != 0
+}
+
+// ERMS indicates support of Enhanced REP MOVSB/STOSB
+func (c CPUInfo) ERMS() bool {
+ return c.Features&ERMS != 0
+}
+
+func (c CPUInfo) RDTSCP() bool {
+ return c.Features&RDTSCP != 0
+}
+
+func (c CPUInfo) CX16() bool {
+ return c.Features&CX16 != 0
+}
+
+// Atom indicates an Atom processor
+func (c CPUInfo) Atom() bool {
+ return c.Features&ATOM != 0
+}
+
+// Intel returns true if vendor is recognized as Intel
+func (c CPUInfo) Intel() bool {
+ return c.VendorID == Intel
+}
+
+// AMD returns true if vendor is recognized as AMD
+func (c CPUInfo) AMD() bool {
+ return c.VendorID == AMD
+}
+
+// Transmeta returns true if vendor is recognized as Transmeta
+func (c CPUInfo) Transmeta() bool {
+ return c.VendorID == Transmeta
+}
+
+// NSC returns true if vendor is recognized as National Semiconductor
+func (c CPUInfo) NSC() bool {
+ return c.VendorID == NSC
+}
+
+// VIA returns true if vendor is recognized as VIA
+func (c CPUInfo) VIA() bool {
+ return c.VendorID == VIA
+}
+
+// RTCounter returns the 64-bit time-stamp counter
+// Uses the RDTSCP instruction. The value 0 is returned
+// if the CPU does not support the instruction.
+func (c CPUInfo) RTCounter() uint64 {
+ if !c.RDTSCP() {
+ return 0
+ }
+ a, _, _, d := rdtscpAsm()
+ return uint64(a) | (uint64(d) << 32)
+}
+
+// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP.
+// This variable is OS dependent, but on Linux contains information
+// about the current cpu/core the code is running on.
+// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned.
+func (c CPUInfo) Ia32TscAux() uint32 {
+ if !c.RDTSCP() {
+ return 0
+ }
+ _, _, ecx, _ := rdtscpAsm()
+ return ecx
+}
+
+// LogicalCPU will return the Logical CPU the code is currently executing on.
+// This is likely to change when the OS re-schedules the running thread
+// to another CPU.
+// If the current core cannot be detected, -1 will be returned.
+func (c CPUInfo) LogicalCPU() int {
+ if c.maxFunc < 1 {
+ return -1
+ }
+ _, ebx, _, _ := cpuid(1)
+ return int(ebx >> 24)
+}
+
+// VM will return true if the CPU ID indicates we are in
+// a virtual machine. This is only a hint, and will very likely
+// have many false negatives.
+func (c CPUInfo) VM() bool {
+ switch c.VendorID {
+ case MSVM, KVM, VMware, XenHVM:
+ return true
+ }
+ return false
+}
+
+// Flags contains detected CPU features and characteristics
+type Flags uint64
+
+// String returns a string representation of the detected
+// CPU features.
+func (f Flags) String() string {
+ return strings.Join(f.Strings(), ",")
+}
+
+// Strings returns an array of the detected features.
+func (f Flags) Strings() []string {
+ s := support()
+ r := make([]string, 0, 20)
+ for i := uint(0); i < 64; i++ {
+ key := Flags(1 << i)
+ val := flagNames[key]
+ if s&key != 0 {
+ r = append(r, val)
+ }
+ }
+ return r
+}
+
+func maxExtendedFunction() uint32 {
+ eax, _, _, _ := cpuid(0x80000000)
+ return eax
+}
+
+func maxFunctionID() uint32 {
+ a, _, _, _ := cpuid(0)
+ return a
+}
+
+func brandName() string {
+ if maxExtendedFunction() >= 0x80000004 {
+ v := make([]uint32, 0, 48)
+ for i := uint32(0); i < 3; i++ {
+ a, b, c, d := cpuid(0x80000002 + i)
+ v = append(v, a, b, c, d)
+ }
+ return strings.Trim(string(valAsString(v...)), " ")
+ }
+ return "unknown"
+}
+
+func threadsPerCore() int {
+ mfi := maxFunctionID()
+ if mfi < 0x4 || vendorID() != Intel {
+ return 1
+ }
+
+ if mfi < 0xb {
+ _, b, _, d := cpuid(1)
+ if (d & (1 << 28)) != 0 {
+ // v will contain logical core count
+ v := (b >> 16) & 255
+ if v > 1 {
+ a4, _, _, _ := cpuid(4)
+ // physical cores
+ v2 := (a4 >> 26) + 1
+ if v2 > 0 {
+ return int(v) / int(v2)
+ }
+ }
+ }
+ return 1
+ }
+ _, b, _, _ := cpuidex(0xb, 0)
+ if b&0xffff == 0 {
+ return 1
+ }
+ return int(b & 0xffff)
+}
+
+func logicalCores() int {
+ mfi := maxFunctionID()
+ switch vendorID() {
+ case Intel:
+ // Use this on old Intel processors
+ if mfi < 0xb {
+ if mfi < 1 {
+ return 0
+ }
+ // CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID)
+ // that can be assigned to logical processors in a physical package.
+ // The value may not be the same as the number of logical processors that are present in the hardware of a physical package.
+ _, ebx, _, _ := cpuid(1)
+ logical := (ebx >> 16) & 0xff
+ return int(logical)
+ }
+ _, b, _, _ := cpuidex(0xb, 1)
+ return int(b & 0xffff)
+ case AMD:
+ _, b, _, _ := cpuid(1)
+ return int((b >> 16) & 0xff)
+ default:
+ return 0
+ }
+}
+
+func familyModel() (int, int) {
+ if maxFunctionID() < 0x1 {
+ return 0, 0
+ }
+ eax, _, _, _ := cpuid(1)
+ family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff)
+ model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0)
+ return int(family), int(model)
+}
+
+func physicalCores() int {
+ switch vendorID() {
+ case Intel:
+ return logicalCores() / threadsPerCore()
+ case AMD:
+ if maxExtendedFunction() >= 0x80000008 {
+ _, _, c, _ := cpuid(0x80000008)
+ return int(c&0xff) + 1
+ }
+ }
+ return 0
+}
+
+// Excerpted from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID
+var vendorMapping = map[string]Vendor{
+ "AMDisbetter!": AMD,
+ "AuthenticAMD": AMD,
+ "CentaurHauls": VIA,
+ "GenuineIntel": Intel,
+ "TransmetaCPU": Transmeta,
+ "GenuineTMx86": Transmeta,
+ "Geode by NSC": NSC,
+ "VIA VIA VIA ": VIA,
+ "KVMKVMKVMKVM": KVM,
+ "Microsoft Hv": MSVM,
+ "VMwareVMware": VMware,
+ "XenVMMXenVMM": XenHVM,
+}
+
+func vendorID() Vendor {
+ _, b, c, d := cpuid(0)
+ v := valAsString(b, d, c)
+ vend, ok := vendorMapping[string(v)]
+ if !ok {
+ return Other
+ }
+ return vend
+}
+
+func cacheLine() int {
+ if maxFunctionID() < 0x1 {
+ return 0
+ }
+
+ _, ebx, _, _ := cpuid(1)
+	cache := (ebx & 0xff00) >> 5 // CLFLUSH line size: ((ebx>>8)&0xff)*8 bytes
+ if cache == 0 && maxExtendedFunction() >= 0x80000006 {
+ _, _, ecx, _ := cpuid(0x80000006)
+ cache = ecx & 0xff // cacheline size
+ }
+ // TODO: Read from Cache and TLB Information
+ return int(cache)
+}
+
+func (c *CPUInfo) cacheSize() {
+ c.Cache.L1D = -1
+ c.Cache.L1I = -1
+ c.Cache.L2 = -1
+ c.Cache.L3 = -1
+ vendor := vendorID()
+ switch vendor {
+ case Intel:
+ if maxFunctionID() < 4 {
+ return
+ }
+ for i := uint32(0); ; i++ {
+ eax, ebx, ecx, _ := cpuidex(4, i)
+ cacheType := eax & 15
+ if cacheType == 0 {
+ break
+ }
+ cacheLevel := (eax >> 5) & 7
+ coherency := int(ebx&0xfff) + 1
+ partitions := int((ebx>>12)&0x3ff) + 1
+ associativity := int((ebx>>22)&0x3ff) + 1
+ sets := int(ecx) + 1
+ size := associativity * partitions * coherency * sets
+ switch cacheLevel {
+ case 1:
+ if cacheType == 1 {
+ // 1 = Data Cache
+ c.Cache.L1D = size
+ } else if cacheType == 2 {
+ // 2 = Instruction Cache
+ c.Cache.L1I = size
+ } else {
+ if c.Cache.L1D < 0 {
+						c.Cache.L1D = size
+ }
+ if c.Cache.L1I < 0 {
+ c.Cache.L1I = size
+ }
+ }
+ case 2:
+ c.Cache.L2 = size
+ case 3:
+ c.Cache.L3 = size
+ }
+ }
+ case AMD:
+ // Untested.
+ if maxExtendedFunction() < 0x80000005 {
+ return
+ }
+ _, _, ecx, edx := cpuid(0x80000005)
+ c.Cache.L1D = int(((ecx >> 24) & 0xFF) * 1024)
+ c.Cache.L1I = int(((edx >> 24) & 0xFF) * 1024)
+
+ if maxExtendedFunction() < 0x80000006 {
+ return
+ }
+ _, _, ecx, _ = cpuid(0x80000006)
+ c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024)
+ }
+
+ return
+}
+
+type SGXSupport struct {
+ Available bool
+ SGX1Supported bool
+ SGX2Supported bool
+ MaxEnclaveSizeNot64 int64
+ MaxEnclaveSize64 int64
+}
+
+func sgx(available bool) (rval SGXSupport) {
+ rval.Available = available
+
+ if !available {
+ return
+ }
+
+ a, _, _, d := cpuidex(0x12, 0)
+ rval.SGX1Supported = a&0x01 != 0
+ rval.SGX2Supported = a&0x02 != 0
+ rval.MaxEnclaveSizeNot64 = 1 << (d & 0xFF) // pow 2
+ rval.MaxEnclaveSize64 = 1 << ((d >> 8) & 0xFF) // pow 2
+
+ return
+}
+
+func support() Flags {
+ mfi := maxFunctionID()
+ vend := vendorID()
+ if mfi < 0x1 {
+ return 0
+ }
+ rval := uint64(0)
+ _, _, c, d := cpuid(1)
+ if (d & (1 << 15)) != 0 {
+ rval |= CMOV
+ }
+ if (d & (1 << 23)) != 0 {
+ rval |= MMX
+ }
+ if (d & (1 << 25)) != 0 {
+ rval |= MMXEXT
+ }
+ if (d & (1 << 25)) != 0 {
+ rval |= SSE
+ }
+ if (d & (1 << 26)) != 0 {
+ rval |= SSE2
+ }
+ if (c & 1) != 0 {
+ rval |= SSE3
+ }
+ if (c & 0x00000200) != 0 {
+ rval |= SSSE3
+ }
+ if (c & 0x00080000) != 0 {
+ rval |= SSE4
+ }
+ if (c & 0x00100000) != 0 {
+ rval |= SSE42
+ }
+ if (c & (1 << 25)) != 0 {
+ rval |= AESNI
+ }
+ if (c & (1 << 1)) != 0 {
+ rval |= CLMUL
+ }
+ if c&(1<<23) != 0 {
+ rval |= POPCNT
+ }
+ if c&(1<<30) != 0 {
+ rval |= RDRAND
+ }
+ if c&(1<<29) != 0 {
+ rval |= F16C
+ }
+ if c&(1<<13) != 0 {
+ rval |= CX16
+ }
+ if vend == Intel && (d&(1<<28)) != 0 && mfi >= 4 {
+ if threadsPerCore() > 1 {
+ rval |= HTT
+ }
+ }
+
+	// Check XGETBV, OSXSAVE and AVX bits
+ if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 {
+ // Check for OS support
+ eax, _ := xgetbv(0)
+ if (eax & 0x6) == 0x6 {
+ rval |= AVX
+ if (c & 0x00001000) != 0 {
+ rval |= FMA3
+ }
+ }
+ }
+
+ // Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
+ if mfi >= 7 {
+ _, ebx, ecx, _ := cpuidex(7, 0)
+ if (rval&AVX) != 0 && (ebx&0x00000020) != 0 {
+ rval |= AVX2
+ }
+ if (ebx & 0x00000008) != 0 {
+ rval |= BMI1
+ if (ebx & 0x00000100) != 0 {
+ rval |= BMI2
+ }
+ }
+ if ebx&(1<<2) != 0 {
+ rval |= SGX
+ }
+ if ebx&(1<<4) != 0 {
+ rval |= HLE
+ }
+ if ebx&(1<<9) != 0 {
+ rval |= ERMS
+ }
+ if ebx&(1<<11) != 0 {
+ rval |= RTM
+ }
+ if ebx&(1<<14) != 0 {
+ rval |= MPX
+ }
+ if ebx&(1<<18) != 0 {
+ rval |= RDSEED
+ }
+ if ebx&(1<<19) != 0 {
+ rval |= ADX
+ }
+ if ebx&(1<<29) != 0 {
+ rval |= SHA
+ }
+
+ // Only detect AVX-512 features if XGETBV is supported
+ if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
+ // Check for OS support
+ eax, _ := xgetbv(0)
+
+			// Verify that XCR0[7:5] = '111b' (OPMASK state, upper 256-bit of ZMM0-ZMM15
+			// and ZMM16-ZMM31 state are enabled by OS) and that XCR0[2:1] = '11b'
+			// (XMM state and YMM state are enabled by OS).
+ if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 {
+ if ebx&(1<<16) != 0 {
+ rval |= AVX512F
+ }
+ if ebx&(1<<17) != 0 {
+ rval |= AVX512DQ
+ }
+ if ebx&(1<<21) != 0 {
+ rval |= AVX512IFMA
+ }
+ if ebx&(1<<26) != 0 {
+ rval |= AVX512PF
+ }
+ if ebx&(1<<27) != 0 {
+ rval |= AVX512ER
+ }
+ if ebx&(1<<28) != 0 {
+ rval |= AVX512CD
+ }
+ if ebx&(1<<30) != 0 {
+ rval |= AVX512BW
+ }
+ if ebx&(1<<31) != 0 {
+ rval |= AVX512VL
+ }
+ // ecx
+ if ecx&(1<<1) != 0 {
+ rval |= AVX512VBMI
+ }
+ }
+ }
+ }
+
+ if maxExtendedFunction() >= 0x80000001 {
+ _, _, c, d := cpuid(0x80000001)
+ if (c & (1 << 5)) != 0 {
+ rval |= LZCNT
+ rval |= POPCNT
+ }
+ if (d & (1 << 31)) != 0 {
+ rval |= AMD3DNOW
+ }
+ if (d & (1 << 30)) != 0 {
+ rval |= AMD3DNOWEXT
+ }
+ if (d & (1 << 23)) != 0 {
+ rval |= MMX
+ }
+ if (d & (1 << 22)) != 0 {
+ rval |= MMXEXT
+ }
+ if (c & (1 << 6)) != 0 {
+ rval |= SSE4A
+ }
+ if d&(1<<20) != 0 {
+ rval |= NX
+ }
+ if d&(1<<27) != 0 {
+ rval |= RDTSCP
+ }
+
+ /* Allow for selectively disabling SSE2 functions on AMD processors
+ with SSE2 support but not SSE4a. This includes Athlon64, some
+ Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
+ than SSE2 often enough to utilize this special-case flag.
+ AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
+ so that SSE2 is used unless explicitly disabled by checking
+ AV_CPU_FLAG_SSE2SLOW. */
+ if vendorID() != Intel &&
+ rval&SSE2 != 0 && (c&0x00000040) == 0 {
+ rval |= SSE2SLOW
+ }
+
+ /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
+ * used unless the OS has AVX support. */
+ if (rval & AVX) != 0 {
+ if (c & 0x00000800) != 0 {
+ rval |= XOP
+ }
+ if (c & 0x00010000) != 0 {
+ rval |= FMA4
+ }
+ }
+
+ if vendorID() == Intel {
+ family, model := familyModel()
+ if family == 6 && (model == 9 || model == 13 || model == 14) {
+ /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
+ * 6/14 (core1 "yonah") theoretically support sse2, but it's
+ * usually slower than mmx. */
+ if (rval & SSE2) != 0 {
+ rval |= SSE2SLOW
+ }
+ if (rval & SSE3) != 0 {
+ rval |= SSE3SLOW
+ }
+ }
+ /* The Atom processor has SSSE3 support, which is useful in many cases,
+ * but sometimes the SSSE3 version is slower than the SSE2 equivalent
+ * on the Atom, but is generally faster on other processors supporting
+ * SSSE3. This flag allows for selectively disabling certain SSSE3
+ * functions on the Atom. */
+ if family == 6 && model == 28 {
+ rval |= ATOM
+ }
+ }
+ }
+ return Flags(rval)
+}
+
+func valAsString(values ...uint32) []byte {
+ r := make([]byte, 4*len(values))
+ for i, v := range values {
+ dst := r[i*4:]
+ dst[0] = byte(v & 0xff)
+ dst[1] = byte((v >> 8) & 0xff)
+ dst[2] = byte((v >> 16) & 0xff)
+ dst[3] = byte((v >> 24) & 0xff)
+ switch {
+ case dst[0] == 0:
+ return r[:i*4]
+ case dst[1] == 0:
+ return r[:i*4+1]
+ case dst[2] == 0:
+ return r[:i*4+2]
+ case dst[3] == 0:
+ return r[:i*4+3]
+ }
+ }
+ return r
+}
diff --git a/vendor/github.com/klauspost/cpuid/cpuid_386.s b/vendor/github.com/klauspost/cpuid/cpuid_386.s
new file mode 100644
index 0000000000..4d731711e4
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/cpuid_386.s
@@ -0,0 +1,42 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build 386,!gccgo
+
+// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·asmCpuid(SB), 7, $0
+ XORL CX, CX
+ MOVL op+0(FP), AX
+ CPUID
+ MOVL AX, eax+4(FP)
+ MOVL BX, ebx+8(FP)
+ MOVL CX, ecx+12(FP)
+ MOVL DX, edx+16(FP)
+ RET
+
+// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·asmCpuidex(SB), 7, $0
+ MOVL op+0(FP), AX
+ MOVL op2+4(FP), CX
+ CPUID
+ MOVL AX, eax+8(FP)
+ MOVL BX, ebx+12(FP)
+ MOVL CX, ecx+16(FP)
+ MOVL DX, edx+20(FP)
+ RET
+
+// func xgetbv(index uint32) (eax, edx uint32)
+TEXT ·asmXgetbv(SB), 7, $0
+ MOVL index+0(FP), CX
+ BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
+ MOVL AX, eax+4(FP)
+ MOVL DX, edx+8(FP)
+ RET
+
+// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+TEXT ·asmRdtscpAsm(SB), 7, $0
+ BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
+ MOVL AX, eax+0(FP)
+ MOVL BX, ebx+4(FP)
+ MOVL CX, ecx+8(FP)
+ MOVL DX, edx+12(FP)
+ RET
diff --git a/vendor/github.com/klauspost/cpuid/cpuid_amd64.s b/vendor/github.com/klauspost/cpuid/cpuid_amd64.s
new file mode 100644
index 0000000000..3c1d60e422
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/cpuid_amd64.s
@@ -0,0 +1,42 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+//+build amd64,!gccgo
+
+// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·asmCpuid(SB), 7, $0
+ XORQ CX, CX
+ MOVL op+0(FP), AX
+ CPUID
+ MOVL AX, eax+8(FP)
+ MOVL BX, ebx+12(FP)
+ MOVL CX, ecx+16(FP)
+ MOVL DX, edx+20(FP)
+ RET
+
+// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·asmCpuidex(SB), 7, $0
+ MOVL op+0(FP), AX
+ MOVL op2+4(FP), CX
+ CPUID
+ MOVL AX, eax+8(FP)
+ MOVL BX, ebx+12(FP)
+ MOVL CX, ecx+16(FP)
+ MOVL DX, edx+20(FP)
+ RET
+
+// func asmXgetbv(index uint32) (eax, edx uint32)
+TEXT ·asmXgetbv(SB), 7, $0
+ MOVL index+0(FP), CX
+ BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
+ MOVL AX, eax+8(FP)
+ MOVL DX, edx+12(FP)
+ RET
+
+// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+TEXT ·asmRdtscpAsm(SB), 7, $0
+ BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
+ MOVL AX, eax+0(FP)
+ MOVL BX, ebx+4(FP)
+ MOVL CX, ecx+8(FP)
+ MOVL DX, edx+12(FP)
+ RET
diff --git a/vendor/github.com/klauspost/cpuid/detect_intel.go b/vendor/github.com/klauspost/cpuid/detect_intel.go
new file mode 100644
index 0000000000..a5f04dd6d0
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/detect_intel.go
@@ -0,0 +1,17 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build 386,!gccgo amd64,!gccgo
+
+package cpuid
+
+func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+func asmXgetbv(index uint32) (eax, edx uint32)
+func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+
+func initCPU() {
+ cpuid = asmCpuid
+ cpuidex = asmCpuidex
+ xgetbv = asmXgetbv
+ rdtscpAsm = asmRdtscpAsm
+}
diff --git a/vendor/github.com/klauspost/cpuid/detect_ref.go b/vendor/github.com/klauspost/cpuid/detect_ref.go
new file mode 100644
index 0000000000..909c5d9a7a
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/detect_ref.go
@@ -0,0 +1,23 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build !amd64,!386 gccgo
+
+package cpuid
+
+func initCPU() {
+ cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) {
+ return 0, 0, 0, 0
+ }
+
+ cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
+ return 0, 0, 0, 0
+ }
+
+ xgetbv = func(index uint32) (eax, edx uint32) {
+ return 0, 0
+ }
+
+ rdtscpAsm = func() (eax, ebx, ecx, edx uint32) {
+ return 0, 0, 0, 0
+ }
+}
diff --git a/vendor/github.com/klauspost/cpuid/generate.go b/vendor/github.com/klauspost/cpuid/generate.go
new file mode 100644
index 0000000000..c060b8165e
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/generate.go
@@ -0,0 +1,3 @@
+package cpuid
+
+//go:generate go run private-gen.go
diff --git a/vendor/github.com/klauspost/cpuid/private-gen.go b/vendor/github.com/klauspost/cpuid/private-gen.go
new file mode 100644
index 0000000000..437333d292
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/private-gen.go
@@ -0,0 +1,476 @@
+// +build ignore
+
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "go/ast"
+ "go/parser"
+ "go/printer"
+ "go/token"
+ "io"
+ "io/ioutil"
+ "log"
+ "os"
+ "reflect"
+ "strings"
+ "unicode"
+ "unicode/utf8"
+)
+
+var inFiles = []string{"cpuid.go", "cpuid_test.go"}
+var copyFiles = []string{"cpuid_amd64.s", "cpuid_386.s", "detect_ref.go", "detect_intel.go"}
+var fileSet = token.NewFileSet()
+var reWrites = []rewrite{
+ initRewrite("CPUInfo -> cpuInfo"),
+ initRewrite("Vendor -> vendor"),
+ initRewrite("Flags -> flags"),
+ initRewrite("Detect -> detect"),
+ initRewrite("CPU -> cpu"),
+}
+var excludeNames = map[string]bool{"string": true, "join": true, "trim": true,
+ // cpuid_test.go
+ "t": true, "println": true, "logf": true, "log": true, "fatalf": true, "fatal": true,
+}
+
+var excludePrefixes = []string{"test", "benchmark"}
+
+func main() {
+ Package := "private"
+ parserMode := parser.ParseComments
+ exported := make(map[string]rewrite)
+ for _, file := range inFiles {
+ in, err := os.Open(file)
+ if err != nil {
+			log.Fatalf("opening input: %s", err)
+ }
+
+ src, err := ioutil.ReadAll(in)
+ if err != nil {
+			log.Fatalf("reading input: %s", err)
+ }
+
+ astfile, err := parser.ParseFile(fileSet, file, src, parserMode)
+ if err != nil {
+			log.Fatalf("parsing input: %s", err)
+ }
+
+ for _, rw := range reWrites {
+ astfile = rw(astfile)
+ }
+
+ // Inspect the AST and print all identifiers and literals.
+ var startDecl token.Pos
+ var endDecl token.Pos
+ ast.Inspect(astfile, func(n ast.Node) bool {
+ var s string
+ switch x := n.(type) {
+ case *ast.Ident:
+ if x.IsExported() {
+ t := strings.ToLower(x.Name)
+ for _, pre := range excludePrefixes {
+ if strings.HasPrefix(t, pre) {
+ return true
+ }
+ }
+ if excludeNames[t] != true {
+ //if x.Pos() > startDecl && x.Pos() < endDecl {
+ exported[x.Name] = initRewrite(x.Name + " -> " + t)
+ }
+ }
+
+ case *ast.GenDecl:
+ if x.Tok == token.CONST && x.Lparen > 0 {
+ startDecl = x.Lparen
+ endDecl = x.Rparen
+ // fmt.Printf("Decl:%s -> %s\n", fileSet.Position(startDecl), fileSet.Position(endDecl))
+ }
+ }
+ if s != "" {
+ fmt.Printf("%s:\t%s\n", fileSet.Position(n.Pos()), s)
+ }
+ return true
+ })
+
+ for _, rw := range exported {
+ astfile = rw(astfile)
+ }
+
+ var buf bytes.Buffer
+
+ printer.Fprint(&buf, fileSet, astfile)
+
+ // Remove package documentation and insert information
+ s := buf.String()
+ ind := strings.Index(buf.String(), "\npackage cpuid")
+ s = s[ind:]
+ s = "// Generated, DO NOT EDIT,\n" +
+ "// but copy it to your own project and rename the package.\n" +
+ "// See more at http://github.com/klauspost/cpuid\n" +
+ s
+
+ outputName := Package + string(os.PathSeparator) + file
+
+ err = ioutil.WriteFile(outputName, []byte(s), 0644)
+ if err != nil {
+ log.Fatalf("writing output: %s", err)
+ }
+ log.Println("Generated", outputName)
+ }
+
+ for _, file := range copyFiles {
+ dst := ""
+ if strings.HasPrefix(file, "cpuid") {
+ dst = Package + string(os.PathSeparator) + file
+ } else {
+ dst = Package + string(os.PathSeparator) + "cpuid_" + file
+ }
+ err := copyFile(file, dst)
+ if err != nil {
+ log.Fatalf("copying file: %s", err)
+ }
+ log.Println("Copied", dst)
+ }
+}
+
+// copyFile copies a file from src to dst. If src and dst files exist, and are
+// the same, then return success. Copy the file contents from src to dst.
+func copyFile(src, dst string) (err error) {
+ sfi, err := os.Stat(src)
+ if err != nil {
+ return
+ }
+ if !sfi.Mode().IsRegular() {
+ // cannot copy non-regular files (e.g., directories,
+ // symlinks, devices, etc.)
+ return fmt.Errorf("CopyFile: non-regular source file %s (%q)", sfi.Name(), sfi.Mode().String())
+ }
+ dfi, err := os.Stat(dst)
+ if err != nil {
+ if !os.IsNotExist(err) {
+ return
+ }
+ } else {
+ if !(dfi.Mode().IsRegular()) {
+ return fmt.Errorf("CopyFile: non-regular destination file %s (%q)", dfi.Name(), dfi.Mode().String())
+ }
+ if os.SameFile(sfi, dfi) {
+ return
+ }
+ }
+ err = copyFileContents(src, dst)
+ return
+}
+
+// copyFileContents copies the contents of the file named src to the file named
+// by dst. The file will be created if it does not already exist. If the
+// destination file exists, all its contents will be replaced by the contents
+// of the source file.
+func copyFileContents(src, dst string) (err error) {
+ in, err := os.Open(src)
+ if err != nil {
+ return
+ }
+ defer in.Close()
+ out, err := os.Create(dst)
+ if err != nil {
+ return
+ }
+ defer func() {
+ cerr := out.Close()
+ if err == nil {
+ err = cerr
+ }
+ }()
+ if _, err = io.Copy(out, in); err != nil {
+ return
+ }
+ err = out.Sync()
+ return
+}
+
+type rewrite func(*ast.File) *ast.File
+
+// Mostly copied from gofmt
+func initRewrite(rewriteRule string) rewrite {
+ f := strings.Split(rewriteRule, "->")
+ if len(f) != 2 {
+ fmt.Fprintf(os.Stderr, "rewrite rule must be of the form 'pattern -> replacement'\n")
+ os.Exit(2)
+ }
+ pattern := parseExpr(f[0], "pattern")
+ replace := parseExpr(f[1], "replacement")
+ return func(p *ast.File) *ast.File { return rewriteFile(pattern, replace, p) }
+}
+
+// parseExpr parses s as an expression.
+// It might make sense to expand this to allow statement patterns,
+// but there are problems with preserving formatting and also
+// with what a wildcard for a statement looks like.
+func parseExpr(s, what string) ast.Expr {
+ x, err := parser.ParseExpr(s)
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "parsing %s %s at %s\n", what, s, err)
+ os.Exit(2)
+ }
+ return x
+}
+
+// Keep this function for debugging.
+/*
+func dump(msg string, val reflect.Value) {
+ fmt.Printf("%s:\n", msg)
+ ast.Print(fileSet, val.Interface())
+ fmt.Println()
+}
+*/
+
+// rewriteFile applies the rewrite rule 'pattern -> replace' to an entire file.
+func rewriteFile(pattern, replace ast.Expr, p *ast.File) *ast.File {
+ cmap := ast.NewCommentMap(fileSet, p, p.Comments)
+ m := make(map[string]reflect.Value)
+ pat := reflect.ValueOf(pattern)
+ repl := reflect.ValueOf(replace)
+
+ var rewriteVal func(val reflect.Value) reflect.Value
+ rewriteVal = func(val reflect.Value) reflect.Value {
+ // don't bother if val is invalid to start with
+ if !val.IsValid() {
+ return reflect.Value{}
+ }
+ for k := range m {
+ delete(m, k)
+ }
+ val = apply(rewriteVal, val)
+ if match(m, pat, val) {
+ val = subst(m, repl, reflect.ValueOf(val.Interface().(ast.Node).Pos()))
+ }
+ return val
+ }
+
+ r := apply(rewriteVal, reflect.ValueOf(p)).Interface().(*ast.File)
+ r.Comments = cmap.Filter(r).Comments() // recreate comments list
+ return r
+}
+
+// set is a wrapper for x.Set(y); it protects the caller from panics if x cannot be changed to y.
+func set(x, y reflect.Value) {
+ // don't bother if x cannot be set or y is invalid
+ if !x.CanSet() || !y.IsValid() {
+ return
+ }
+ defer func() {
+ if x := recover(); x != nil {
+ if s, ok := x.(string); ok &&
+ (strings.Contains(s, "type mismatch") || strings.Contains(s, "not assignable")) {
+ // x cannot be set to y - ignore this rewrite
+ return
+ }
+ panic(x)
+ }
+ }()
+ x.Set(y)
+}
+
+// Values/types for special cases.
+var (
+ objectPtrNil = reflect.ValueOf((*ast.Object)(nil))
+ scopePtrNil = reflect.ValueOf((*ast.Scope)(nil))
+
+ identType = reflect.TypeOf((*ast.Ident)(nil))
+ objectPtrType = reflect.TypeOf((*ast.Object)(nil))
+ positionType = reflect.TypeOf(token.NoPos)
+ callExprType = reflect.TypeOf((*ast.CallExpr)(nil))
+ scopePtrType = reflect.TypeOf((*ast.Scope)(nil))
+)
+
+// apply replaces each AST field x in val with f(x), returning val.
+// To avoid extra conversions, f operates on the reflect.Value form.
+func apply(f func(reflect.Value) reflect.Value, val reflect.Value) reflect.Value {
+ if !val.IsValid() {
+ return reflect.Value{}
+ }
+
+ // *ast.Objects introduce cycles and are likely incorrect after
+ // rewrite; don't follow them but replace with nil instead
+ if val.Type() == objectPtrType {
+ return objectPtrNil
+ }
+
+ // similarly for scopes: they are likely incorrect after a rewrite;
+ // replace them with nil
+ if val.Type() == scopePtrType {
+ return scopePtrNil
+ }
+
+ switch v := reflect.Indirect(val); v.Kind() {
+ case reflect.Slice:
+ for i := 0; i < v.Len(); i++ {
+ e := v.Index(i)
+ set(e, f(e))
+ }
+ case reflect.Struct:
+ for i := 0; i < v.NumField(); i++ {
+ e := v.Field(i)
+ set(e, f(e))
+ }
+ case reflect.Interface:
+ e := v.Elem()
+ set(v, f(e))
+ }
+ return val
+}
+
+func isWildcard(s string) bool {
+ rune, size := utf8.DecodeRuneInString(s)
+ return size == len(s) && unicode.IsLower(rune)
+}
+
+// match returns true if pattern matches val,
+// recording wildcard submatches in m.
+// If m == nil, match checks whether pattern == val.
+func match(m map[string]reflect.Value, pattern, val reflect.Value) bool {
+ // Wildcard matches any expression. If it appears multiple
+ // times in the pattern, it must match the same expression
+ // each time.
+ if m != nil && pattern.IsValid() && pattern.Type() == identType {
+ name := pattern.Interface().(*ast.Ident).Name
+ if isWildcard(name) && val.IsValid() {
+ // wildcards only match valid (non-nil) expressions.
+ if _, ok := val.Interface().(ast.Expr); ok && !val.IsNil() {
+ if old, ok := m[name]; ok {
+ return match(nil, old, val)
+ }
+ m[name] = val
+ return true
+ }
+ }
+ }
+
+ // Otherwise, pattern and val must match recursively.
+ if !pattern.IsValid() || !val.IsValid() {
+ return !pattern.IsValid() && !val.IsValid()
+ }
+ if pattern.Type() != val.Type() {
+ return false
+ }
+
+ // Special cases.
+ switch pattern.Type() {
+ case identType:
+ // For identifiers, only the names need to match
+ // (and none of the other *ast.Object information).
+ // This is a common case, handle it all here instead
+ // of recursing down any further via reflection.
+ p := pattern.Interface().(*ast.Ident)
+ v := val.Interface().(*ast.Ident)
+ return p == nil && v == nil || p != nil && v != nil && p.Name == v.Name
+ case objectPtrType, positionType:
+ // object pointers and token positions always match
+ return true
+ case callExprType:
+ // For calls, the Ellipsis fields (token.Position) must
+ // match since that is how f(x) and f(x...) are different.
+ // Check them here but fall through for the remaining fields.
+ p := pattern.Interface().(*ast.CallExpr)
+ v := val.Interface().(*ast.CallExpr)
+ if p.Ellipsis.IsValid() != v.Ellipsis.IsValid() {
+ return false
+ }
+ }
+
+ p := reflect.Indirect(pattern)
+ v := reflect.Indirect(val)
+ if !p.IsValid() || !v.IsValid() {
+ return !p.IsValid() && !v.IsValid()
+ }
+
+ switch p.Kind() {
+ case reflect.Slice:
+ if p.Len() != v.Len() {
+ return false
+ }
+ for i := 0; i < p.Len(); i++ {
+ if !match(m, p.Index(i), v.Index(i)) {
+ return false
+ }
+ }
+ return true
+
+ case reflect.Struct:
+ for i := 0; i < p.NumField(); i++ {
+ if !match(m, p.Field(i), v.Field(i)) {
+ return false
+ }
+ }
+ return true
+
+ case reflect.Interface:
+ return match(m, p.Elem(), v.Elem())
+ }
+
+ // Handle token integers, etc.
+ return p.Interface() == v.Interface()
+}
+
+// subst returns a copy of pattern with values from m substituted in place
+// of wildcards and pos used as the position of tokens from the pattern.
+// if m == nil, subst returns a copy of pattern and doesn't change the line
+// number information.
+func subst(m map[string]reflect.Value, pattern reflect.Value, pos reflect.Value) reflect.Value {
+ if !pattern.IsValid() {
+ return reflect.Value{}
+ }
+
+ // Wildcard gets replaced with map value.
+ if m != nil && pattern.Type() == identType {
+ name := pattern.Interface().(*ast.Ident).Name
+ if isWildcard(name) {
+ if old, ok := m[name]; ok {
+ return subst(nil, old, reflect.Value{})
+ }
+ }
+ }
+
+ if pos.IsValid() && pattern.Type() == positionType {
+ // use new position only if old position was valid in the first place
+ if old := pattern.Interface().(token.Pos); !old.IsValid() {
+ return pattern
+ }
+ return pos
+ }
+
+ // Otherwise copy.
+ switch p := pattern; p.Kind() {
+ case reflect.Slice:
+ v := reflect.MakeSlice(p.Type(), p.Len(), p.Len())
+ for i := 0; i < p.Len(); i++ {
+ v.Index(i).Set(subst(m, p.Index(i), pos))
+ }
+ return v
+
+ case reflect.Struct:
+ v := reflect.New(p.Type()).Elem()
+ for i := 0; i < p.NumField(); i++ {
+ v.Field(i).Set(subst(m, p.Field(i), pos))
+ }
+ return v
+
+ case reflect.Ptr:
+ v := reflect.New(p.Type()).Elem()
+ if elem := p.Elem(); elem.IsValid() {
+ v.Set(subst(m, elem, pos).Addr())
+ }
+ return v
+
+ case reflect.Interface:
+ v := reflect.New(p.Type()).Elem()
+ if elem := p.Elem(); elem.IsValid() {
+ v.Set(subst(m, elem, pos))
+ }
+ return v
+ }
+
+ return pattern
+}
diff --git a/vendor/github.com/klauspost/crc32/LICENSE b/vendor/github.com/klauspost/crc32/LICENSE
new file mode 100644
index 0000000000..4fd5963e39
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2012 The Go Authors. All rights reserved.
+Copyright (c) 2015 Klaus Post
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/klauspost/crc32/README.md b/vendor/github.com/klauspost/crc32/README.md
new file mode 100644
index 0000000000..029625d360
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/README.md
@@ -0,0 +1,87 @@
+# crc32
+CRC32 hash with x64 optimizations
+
+This package is a drop-in replacement for the standard library `hash/crc32` package. It features SSE 4.2 optimizations on x64 platforms, giving roughly a 10x speedup.
+
+[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32)
+
+# usage
+
+Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer.
+
+Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go.
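+
+For illustration (this example is not part of the upstream documentation), here is a minimal program that uses the package exactly like the standard library, computing both IEEE and Castagnoli (CRC-32C) checksums:
+
+```Go
+package main
+
+import (
+	"fmt"
+
+	"github.com/klauspost/crc32"
+)
+
+func main() {
+	data := []byte("hello world")
+
+	// IEEE is the default polynomial, exactly as in hash/crc32.
+	fmt.Printf("IEEE:    0x%08x\n", crc32.ChecksumIEEE(data))
+
+	// Castagnoli (CRC-32C) uses the SSE 4.2 CRC32 instruction when available.
+	tab := crc32.MakeTable(crc32.Castagnoli)
+	fmt.Printf("CRC-32C: 0x%08x\n", crc32.Checksum(data, tab))
+}
+```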
+
+# changes
+* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match.
+* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable.
+
+
+# performance
+
+For *Go 1.7* the performance is equivalent to the standard library, so if you build with Go 1.7 or newer you can switch back to `hash/crc32`.
+
+
+For IEEE tables (the most common), there is approximately a factor 10 speedup with the "CLMUL" (carry-less multiplication) instruction:
+```
+benchmark old ns/op new ns/op delta
+BenchmarkCrc32KB 99955 10258 -89.74%
+
+benchmark old MB/s new MB/s speedup
+BenchmarkCrc32KB 327.83 3194.20 9.74x
+```
+
+For other tables and "CLMUL" capable machines the performance is the same as the standard library.
+
+Here are some detailed benchmarks, comparing against the Go 1.5 standard library with and without assembler enabled.
+
+```
+Std: Standard Go 1.5 library
+Crc: Indicates IEEE type CRC.
+40B: Size of each slice encoded.
+NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine).
+Castagnoli: Castagnoli CRC type.
+
+BenchmarkStdCrc40B-4 10000000 158 ns/op 252.88 MB/s
+BenchmarkCrc40BNoAsm-4 20000000 105 ns/op 377.38 MB/s (slice8)
+BenchmarkCrc40B-4 20000000 105 ns/op 378.77 MB/s (slice8)
+
+BenchmarkStdCrc1KB-4 500000 3604 ns/op 284.10 MB/s
+BenchmarkCrc1KBNoAsm-4 1000000 1463 ns/op 699.79 MB/s (slice8)
+BenchmarkCrc1KB-4 3000000 396 ns/op 2583.69 MB/s (asm)
+
+BenchmarkStdCrc8KB-4 200000 11417 ns/op 717.48 MB/s (slice8)
+BenchmarkCrc8KBNoAsm-4 200000 11317 ns/op 723.85 MB/s (slice8)
+BenchmarkCrc8KB-4 500000 2919 ns/op 2805.73 MB/s (asm)
+
+BenchmarkStdCrc32KB-4 30000 45749 ns/op 716.24 MB/s (slice8)
+BenchmarkCrc32KBNoAsm-4 30000 45109 ns/op 726.42 MB/s (slice8)
+BenchmarkCrc32KB-4 100000 11497 ns/op 2850.09 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnol40B-4 10000000 161 ns/op 246.94 MB/s
+BenchmarkStdCastagnoli40B-4 50000000 28.4 ns/op 1410.69 MB/s (asm)
+BenchmarkCastagnoli40BNoAsm-4 20000000 100 ns/op 398.01 MB/s (slice8)
+BenchmarkCastagnoli40B-4 50000000 28.2 ns/op 1419.54 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnoli1KB-4 500000 3622 ns/op 282.67 MB/s
+BenchmarkStdCastagnoli1KB-4 10000000 144 ns/op 7099.78 MB/s (asm)
+BenchmarkCastagnoli1KBNoAsm-4 1000000 1475 ns/op 694.14 MB/s (slice8)
+BenchmarkCastagnoli1KB-4 10000000 146 ns/op 6993.35 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnoli8KB-4 50000 28781 ns/op 284.63 MB/s
+BenchmarkStdCastagnoli8KB-4 1000000 1029 ns/op 7957.89 MB/s (asm)
+BenchmarkCastagnoli8KBNoAsm-4 200000 11410 ns/op 717.94 MB/s (slice8)
+BenchmarkCastagnoli8KB-4 1000000 1000 ns/op 8188.71 MB/s (asm)
+
+BenchmarkStdNoAsmCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s
+BenchmarkStdCastagnoli32KB-4 300000 4065 ns/op 8059.13 MB/s (asm)
+BenchmarkCastagnoli32KBNoAsm-4 30000 45171 ns/op 725.41 MB/s (slice8)
+BenchmarkCastagnoli32KB-4 500000 4077 ns/op 8035.89 MB/s (asm)
+```
+
+The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library.
+
+However, the improved use of slice-by-8 has not been submitted yet, but will probably be proposed for Go 1.7.
+
+# license
+
+Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions.
diff --git a/vendor/github.com/klauspost/crc32/crc32.go b/vendor/github.com/klauspost/crc32/crc32.go
new file mode 100644
index 0000000000..8aa91b17e9
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/crc32.go
@@ -0,0 +1,207 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
+// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
+// information.
+//
+// Polynomials are represented in LSB-first form also known as reversed representation.
+//
+// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
+// for information.
+package crc32
+
+import (
+ "hash"
+ "sync"
+)
+
+// The size of a CRC-32 checksum in bytes.
+const Size = 4
+
+// Predefined polynomials.
+const (
+ // IEEE is by far and away the most common CRC-32 polynomial.
+ // Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
+ IEEE = 0xedb88320
+
+ // Castagnoli's polynomial, used in iSCSI.
+ // Has better error detection characteristics than IEEE.
+ // http://dx.doi.org/10.1109/26.231911
+ Castagnoli = 0x82f63b78
+
+ // Koopman's polynomial.
+ // Also has better error detection characteristics than IEEE.
+ // http://dx.doi.org/10.1109/DSN.2002.1028931
+ Koopman = 0xeb31d82e
+)
+
+// Table is a 256-word table representing the polynomial for efficient processing.
+type Table [256]uint32
+
+// This file makes use of functions implemented in architecture-specific files.
+// The interface that they implement is as follows:
+//
+// // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE
+// // algorithm is available.
+// archAvailableIEEE() bool
+//
+// // archInitIEEE initializes the architecture-specific CRC32-IEEE algorithm.
+// // It can only be called if archAvailableIEEE() returns true.
+// archInitIEEE()
+//
+// // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if
+// // archInitIEEE() was previously called.
+// archUpdateIEEE(crc uint32, p []byte) uint32
+//
+// // archAvailableCastagnoli reports whether an architecture-specific
+// // CRC32-C algorithm is available.
+// archAvailableCastagnoli() bool
+//
+// // archInitCastagnoli initializes the architecture-specific CRC32-C
+// // algorithm. It can only be called if archAvailableCastagnoli() returns
+// // true.
+// archInitCastagnoli()
+//
+// // archUpdateCastagnoli updates the given CRC32-C. It can only be called
+// // if archInitCastagnoli() was previously called.
+// archUpdateCastagnoli(crc uint32, p []byte) uint32
+
+// castagnoliTable points to a lazily initialized Table for the Castagnoli
+// polynomial. MakeTable will always return this value when asked to make a
+// Castagnoli table so we can compare against it to find when the caller is
+// using this polynomial.
+var castagnoliTable *Table
+var castagnoliTable8 *slicing8Table
+var castagnoliArchImpl bool
+var updateCastagnoli func(crc uint32, p []byte) uint32
+var castagnoliOnce sync.Once
+
+func castagnoliInit() {
+ castagnoliTable = simpleMakeTable(Castagnoli)
+ castagnoliArchImpl = archAvailableCastagnoli()
+
+ if castagnoliArchImpl {
+ archInitCastagnoli()
+ updateCastagnoli = archUpdateCastagnoli
+ } else {
+ // Initialize the slicing-by-8 table.
+ castagnoliTable8 = slicingMakeTable(Castagnoli)
+ updateCastagnoli = func(crc uint32, p []byte) uint32 {
+ return slicingUpdate(crc, castagnoliTable8, p)
+ }
+ }
+}
+
+// IEEETable is the table for the IEEE polynomial.
+var IEEETable = simpleMakeTable(IEEE)
+
+// ieeeTable8 is the slicing8Table for IEEE
+var ieeeTable8 *slicing8Table
+var ieeeArchImpl bool
+var updateIEEE func(crc uint32, p []byte) uint32
+var ieeeOnce sync.Once
+
+func ieeeInit() {
+ ieeeArchImpl = archAvailableIEEE()
+
+ if ieeeArchImpl {
+ archInitIEEE()
+ updateIEEE = archUpdateIEEE
+ } else {
+ // Initialize the slicing-by-8 table.
+ ieeeTable8 = slicingMakeTable(IEEE)
+ updateIEEE = func(crc uint32, p []byte) uint32 {
+ return slicingUpdate(crc, ieeeTable8, p)
+ }
+ }
+}
+
+// MakeTable returns a Table constructed from the specified polynomial.
+// The contents of this Table must not be modified.
+func MakeTable(poly uint32) *Table {
+ switch poly {
+ case IEEE:
+ ieeeOnce.Do(ieeeInit)
+ return IEEETable
+ case Castagnoli:
+ castagnoliOnce.Do(castagnoliInit)
+ return castagnoliTable
+ }
+ return simpleMakeTable(poly)
+}
+
+// digest represents the partial evaluation of a checksum.
+type digest struct {
+ crc uint32
+ tab *Table
+}
+
+// New creates a new hash.Hash32 computing the CRC-32 checksum
+// using the polynomial represented by the Table.
+// Its Sum method will lay the value out in big-endian byte order.
+func New(tab *Table) hash.Hash32 {
+ if tab == IEEETable {
+ ieeeOnce.Do(ieeeInit)
+ }
+ return &digest{0, tab}
+}
+
+// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
+// using the IEEE polynomial.
+// Its Sum method will lay the value out in big-endian byte order.
+func NewIEEE() hash.Hash32 { return New(IEEETable) }
+
+func (d *digest) Size() int { return Size }
+
+func (d *digest) BlockSize() int { return 1 }
+
+func (d *digest) Reset() { d.crc = 0 }
+
+// Update returns the result of adding the bytes in p to the crc.
+func Update(crc uint32, tab *Table, p []byte) uint32 {
+ switch tab {
+ case castagnoliTable:
+ return updateCastagnoli(crc, p)
+ case IEEETable:
+ // Unfortunately, because IEEETable is exported, IEEE may be used without a
+ // call to MakeTable. We have to make sure it gets initialized in that case.
+ ieeeOnce.Do(ieeeInit)
+ return updateIEEE(crc, p)
+ default:
+ return simpleUpdate(crc, tab, p)
+ }
+}
+
+func (d *digest) Write(p []byte) (n int, err error) {
+ switch d.tab {
+ case castagnoliTable:
+ d.crc = updateCastagnoli(d.crc, p)
+ case IEEETable:
+ // We only create digest objects through New() which takes care of
+ // initialization in this case.
+ d.crc = updateIEEE(d.crc, p)
+ default:
+ d.crc = simpleUpdate(d.crc, d.tab, p)
+ }
+ return len(p), nil
+}
+
+func (d *digest) Sum32() uint32 { return d.crc }
+
+func (d *digest) Sum(in []byte) []byte {
+ s := d.Sum32()
+ return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s))
+}
+
+// Checksum returns the CRC-32 checksum of data
+// using the polynomial represented by the Table.
+func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }
+
+// ChecksumIEEE returns the CRC-32 checksum of data
+// using the IEEE polynomial.
+func ChecksumIEEE(data []byte) uint32 {
+ ieeeOnce.Do(ieeeInit)
+ return updateIEEE(0, data)
+}
diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.go b/vendor/github.com/klauspost/crc32/crc32_amd64.go
new file mode 100644
index 0000000000..af2a0b844b
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/crc32_amd64.go
@@ -0,0 +1,230 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine,!gccgo
+
+// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
+// description of the interface that each architecture-specific file
+// implements.
+
+package crc32
+
+import "unsafe"
+
+// This file contains the code to call the SSE 4.2 version of the Castagnoli
+// and IEEE CRC.
+
+// haveSSE41/haveSSE42/haveCLMUL are defined in crc32_amd64.s and use
+// CPUID to test for SSE 4.1, SSE 4.2 and CLMUL support.
+func haveSSE41() bool
+func haveSSE42() bool
+func haveCLMUL() bool
+
+// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
+// instruction.
+//go:noescape
+func castagnoliSSE42(crc uint32, p []byte) uint32
+
+// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32
+// instruction.
+//go:noescape
+func castagnoliSSE42Triple(
+ crcA, crcB, crcC uint32,
+ a, b, c []byte,
+ rounds uint32,
+) (retA uint32, retB uint32, retC uint32)
+
+// ieeeCLMUL is defined in crc32_amd64.s and uses the PCLMULQDQ
+// instruction as well as SSE 4.1.
+//go:noescape
+func ieeeCLMUL(crc uint32, p []byte) uint32
+
+var sse42 = haveSSE42()
+var useFastIEEE = haveCLMUL() && haveSSE41()
+
+const castagnoliK1 = 168
+const castagnoliK2 = 1344
+
+type sse42Table [4]Table
+
+var castagnoliSSE42TableK1 *sse42Table
+var castagnoliSSE42TableK2 *sse42Table
+
+func archAvailableCastagnoli() bool {
+ return sse42
+}
+
+func archInitCastagnoli() {
+ if !sse42 {
+ panic("arch-specific Castagnoli not available")
+ }
+ castagnoliSSE42TableK1 = new(sse42Table)
+ castagnoliSSE42TableK2 = new(sse42Table)
+ // See description in updateCastagnoli.
+ // t[0][i] = CRC(i000, O)
+ // t[1][i] = CRC(0i00, O)
+ // t[2][i] = CRC(00i0, O)
+ // t[3][i] = CRC(000i, O)
+ // where O is a sequence of K zeros.
+ var tmp [castagnoliK2]byte
+ for b := 0; b < 4; b++ {
+ for i := 0; i < 256; i++ {
+ val := uint32(i) << uint32(b*8)
+ castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1])
+ castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:])
+ }
+ }
+}
+
+// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the
+// table given) with the given initial crc value. This corresponds to
+// CRC(crc, O) in the description in updateCastagnoli.
+func castagnoliShift(table *sse42Table, crc uint32) uint32 {
+ return table[3][crc>>24] ^
+ table[2][(crc>>16)&0xFF] ^
+ table[1][(crc>>8)&0xFF] ^
+ table[0][crc&0xFF]
+}
+
+func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
+ if !sse42 {
+ panic("not available")
+ }
+
+ // This method is inspired from the algorithm in Intel's white paper:
+ // "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
+ // The same strategy of splitting the buffer in three is used but the
+ // combining calculation is different; the complete derivation is explained
+ // below.
+ //
+ // -- The basic idea --
+ //
+ // The CRC32 instruction (available in SSE4.2) can process 8 bytes at a
+ // time. In recent Intel architectures the instruction takes 3 cycles;
+ // however the processor can pipeline up to three instructions if they
+ // don't depend on each other.
+ //
+ // Roughly this means that we can process three buffers in about the same
+ // time we can process one buffer.
+ //
+ // The idea is then to split the buffer in three, CRC the three pieces
+ // separately and then combine the results.
+ //
+ // Combining the results requires precomputed tables, so we must choose a
+ // fixed buffer length to optimize. The longer the length, the faster; but
+ // only buffers longer than this length will use the optimization. We choose
+ // two cutoffs and compute tables for both:
+ // - one around 512: 168*3=504
+ // - one around 4KB: 1344*3=4032
+ //
+ // -- The nitty gritty --
+ //
+ // Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with
+ // initial non-inverted CRC I). This function has the following properties:
+ // (a) CRC(I, AB) = CRC(CRC(I, A), B)
+ // (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)
+ //
+ // Say we want to compute CRC(I, ABC) where A, B, C are three sequences of
+ // K bytes each, where K is a fixed constant. Let O be the sequence of K zero
+ // bytes.
+ //
+ // CRC(I, ABC) = CRC(I, ABO xor C)
+ // = CRC(I, ABO) xor CRC(0, C)
+ // = CRC(CRC(I, AB), O) xor CRC(0, C)
+ // = CRC(CRC(I, AO xor B), O) xor CRC(0, C)
+ // = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)
+ // = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
+ //
+ // The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),
+ // and CRC(0, C) efficiently. We just need to find a way to quickly compute
+ // CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these
+ // values; since we can't have a 32-bit table, we break it up into four
+ // 8-bit tables:
+ //
+ // CRC(uvwx, O) = CRC(u000, O) xor
+ // CRC(0v00, O) xor
+ // CRC(00w0, O) xor
+ // CRC(000x, O)
+ //
+ // We can compute tables corresponding to the four terms for all 8-bit
+ // values.
+
+ crc = ^crc
+
+ // If a buffer is long enough to use the optimization, process the first few
+ // bytes to align the buffer to an 8-byte boundary (if necessary).
+ if len(p) >= castagnoliK1*3 {
+ delta := int(uintptr(unsafe.Pointer(&p[0])) & 7)
+ if delta != 0 {
+ delta = 8 - delta
+ crc = castagnoliSSE42(crc, p[:delta])
+ p = p[delta:]
+ }
+ }
+
+ // Process 3*K2 at a time.
+ for len(p) >= castagnoliK2*3 {
+ // Compute CRC(I, A), CRC(0, B), and CRC(0, C).
+ crcA, crcB, crcC := castagnoliSSE42Triple(
+ crc, 0, 0,
+ p, p[castagnoliK2:], p[castagnoliK2*2:],
+ castagnoliK2/24)
+
+ // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
+ crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB
+ // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
+ crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC
+ p = p[castagnoliK2*3:]
+ }
+
+ // Process 3*K1 at a time.
+ for len(p) >= castagnoliK1*3 {
+ // Compute CRC(I, A), CRC(0, B), and CRC(0, C).
+ crcA, crcB, crcC := castagnoliSSE42Triple(
+ crc, 0, 0,
+ p, p[castagnoliK1:], p[castagnoliK1*2:],
+ castagnoliK1/24)
+
+ // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
+ crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB
+ // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
+ crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC
+ p = p[castagnoliK1*3:]
+ }
+
+ // Use the simple implementation for what's left.
+ crc = castagnoliSSE42(crc, p)
+ return ^crc
+}
+
+func archAvailableIEEE() bool {
+ return useFastIEEE
+}
+
+var archIeeeTable8 *slicing8Table
+
+func archInitIEEE() {
+ if !useFastIEEE {
+ panic("not available")
+ }
+ // We still use slicing-by-8 for small buffers.
+ archIeeeTable8 = slicingMakeTable(IEEE)
+}
+
+func archUpdateIEEE(crc uint32, p []byte) uint32 {
+ if !useFastIEEE {
+ panic("not available")
+ }
+
+ if len(p) >= 64 {
+ left := len(p) & 15
+ do := len(p) - left
+ crc = ^ieeeCLMUL(^crc, p[:do])
+ p = p[do:]
+ }
+ if len(p) == 0 {
+ return crc
+ }
+ return slicingUpdate(crc, archIeeeTable8, p)
+}
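
The combining derivation inside archUpdateCastagnoli rests on property (a), CRC(I, AB) = CRC(CRC(I, A), B). The analogue of that property for the exported (inverted) API is the usual streaming identity, which a short sketch against the standard hash/crc32 can confirm; property (b) involves the non-inverted CRC and is not shown here:

    package main

    import (
    	"fmt"
    	"hash/crc32"
    )

    func main() {
    	tab := crc32.MakeTable(crc32.Castagnoli)
    	a := []byte("first half, ")
    	b := []byte("second half")

    	// Hash the concatenation AB in one shot.
    	whole := crc32.Checksum(append(append([]byte{}, a...), b...), tab)

    	// Feed A, then B, incrementally; this is the exported counterpart of
    	// CRC(I, AB) = CRC(CRC(I, A), B).
    	incremental := crc32.Update(crc32.Update(0, tab, a), tab, b)

    	fmt.Println(whole == incremental) // true
    }

archUpdateCastagnoli needs one further ingredient: CRC(crc, O) for a fixed run of zero bytes, which is exactly what the castagnoliSSE42TableK1/K2 tables precompute.
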
diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.s b/vendor/github.com/klauspost/crc32/crc32_amd64.s
new file mode 100644
index 0000000000..e8a7941ce7
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/crc32_amd64.s
@@ -0,0 +1,319 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build gc
+
+#define NOSPLIT 4
+#define RODATA 8
+
+// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
+//
+// func castagnoliSSE42(crc uint32, p []byte) uint32
+TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
+ MOVL crc+0(FP), AX // CRC value
+ MOVQ p+8(FP), SI // data pointer
+ MOVQ p_len+16(FP), CX // len(p)
+
+ // If there are fewer than 8 bytes to process, skip alignment.
+ CMPQ CX, $8
+ JL less_than_8
+
+ MOVQ SI, BX
+ ANDQ $7, BX
+ JZ aligned
+
+ // Process the first few bytes to 8-byte align the input.
+
+ // BX = 8 - BX. We need to process this many bytes to align.
+ SUBQ $1, BX
+ XORQ $7, BX
+
+ BTQ $0, BX
+ JNC align_2
+
+ CRC32B (SI), AX
+ DECQ CX
+ INCQ SI
+
+align_2:
+ BTQ $1, BX
+ JNC align_4
+
+ // CRC32W (SI), AX
+ BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+
+ SUBQ $2, CX
+ ADDQ $2, SI
+
+align_4:
+ BTQ $2, BX
+ JNC aligned
+
+ // CRC32L (SI), AX
+ BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+
+ SUBQ $4, CX
+ ADDQ $4, SI
+
+aligned:
+ // The input is now 8-byte aligned and we can process 8-byte chunks.
+ CMPQ CX, $8
+ JL less_than_8
+
+ CRC32Q (SI), AX
+ ADDQ $8, SI
+ SUBQ $8, CX
+ JMP aligned
+
+less_than_8:
+ // We may have some bytes left over; process 4 bytes, then 2, then 1.
+ BTQ $2, CX
+ JNC less_than_4
+
+ // CRC32L (SI), AX
+ BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+ ADDQ $4, SI
+
+less_than_4:
+ BTQ $1, CX
+ JNC less_than_2
+
+ // CRC32W (SI), AX
+ BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+ ADDQ $2, SI
+
+less_than_2:
+ BTQ $0, CX
+ JNC done
+
+ CRC32B (SI), AX
+
+done:
+ MOVL AX, ret+32(FP)
+ RET
+
+// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
+// bytes from each buffer.
+//
+// func castagnoliSSE42Triple(
+// crc1, crc2, crc3 uint32,
+// a, b, c []byte,
+// rounds uint32,
+// ) (retA uint32, retB uint32, retC uint32)
+TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0
+ MOVL crcA+0(FP), AX
+ MOVL crcB+4(FP), CX
+ MOVL crcC+8(FP), DX
+
+ MOVQ a+16(FP), R8 // data pointer
+ MOVQ b+40(FP), R9 // data pointer
+ MOVQ c+64(FP), R10 // data pointer
+
+ MOVL rounds+88(FP), R11
+
+loop:
+ CRC32Q (R8), AX
+ CRC32Q (R9), CX
+ CRC32Q (R10), DX
+
+ CRC32Q 8(R8), AX
+ CRC32Q 8(R9), CX
+ CRC32Q 8(R10), DX
+
+ CRC32Q 16(R8), AX
+ CRC32Q 16(R9), CX
+ CRC32Q 16(R10), DX
+
+ ADDQ $24, R8
+ ADDQ $24, R9
+ ADDQ $24, R10
+
+ DECQ R11
+ JNZ loop
+
+ MOVL AX, retA+96(FP)
+ MOVL CX, retB+100(FP)
+ MOVL DX, retC+104(FP)
+ RET
+
+// func haveSSE42() bool
+TEXT ·haveSSE42(SB), NOSPLIT, $0
+ XORQ AX, AX
+ INCL AX
+ CPUID
+ SHRQ $20, CX
+ ANDQ $1, CX
+ MOVB CX, ret+0(FP)
+ RET
+
+// func haveCLMUL() bool
+TEXT ·haveCLMUL(SB), NOSPLIT, $0
+ XORQ AX, AX
+ INCL AX
+ CPUID
+ SHRQ $1, CX
+ ANDQ $1, CX
+ MOVB CX, ret+0(FP)
+ RET
+
+// func haveSSE41() bool
+TEXT ·haveSSE41(SB), NOSPLIT, $0
+ XORQ AX, AX
+ INCL AX
+ CPUID
+ SHRQ $19, CX
+ ANDQ $1, CX
+ MOVB CX, ret+0(FP)
+ RET
+
+// CRC32 polynomial data
+//
+// These constants are lifted from the
+// Linux kernel, since they avoid the costly
+// PSHUFB 16-byte reversal proposed in the
+// original Intel paper.
+DATA r2r1kp<>+0(SB)/8, $0x154442bd4
+DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
+DATA r4r3kp<>+0(SB)/8, $0x1751997d0
+DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
+DATA rupolykp<>+0(SB)/8, $0x1db710641
+DATA rupolykp<>+8(SB)/8, $0x1f7011641
+DATA r5kp<>+0(SB)/8, $0x163cd6124
+
+GLOBL r2r1kp<>(SB), RODATA, $16
+GLOBL r4r3kp<>(SB), RODATA, $16
+GLOBL rupolykp<>(SB), RODATA, $16
+GLOBL r5kp<>(SB), RODATA, $8
+
+// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+// len(p) must be at least 64, and must be a multiple of 16.
+
+// func ieeeCLMUL(crc uint32, p []byte) uint32
+TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
+ MOVL crc+0(FP), X0 // Initial CRC value
+ MOVQ p+8(FP), SI // data pointer
+ MOVQ p_len+16(FP), CX // len(p)
+
+ MOVOU (SI), X1
+ MOVOU 16(SI), X2
+ MOVOU 32(SI), X3
+ MOVOU 48(SI), X4
+ PXOR X0, X1
+ ADDQ $64, SI // buf+=64
+ SUBQ $64, CX // len-=64
+ CMPQ CX, $64 // Less than 64 bytes left
+ JB remain64
+
+ MOVOA r2r1kp<>+0(SB), X0
+
+loopback64:
+ MOVOA X1, X5
+ MOVOA X2, X6
+ MOVOA X3, X7
+ MOVOA X4, X8
+
+ PCLMULQDQ $0, X0, X1
+ PCLMULQDQ $0, X0, X2
+ PCLMULQDQ $0, X0, X3
+ PCLMULQDQ $0, X0, X4
+
+ // Load next early
+ MOVOU (SI), X11
+ MOVOU 16(SI), X12
+ MOVOU 32(SI), X13
+ MOVOU 48(SI), X14
+
+ PCLMULQDQ $0x11, X0, X5
+ PCLMULQDQ $0x11, X0, X6
+ PCLMULQDQ $0x11, X0, X7
+ PCLMULQDQ $0x11, X0, X8
+
+ PXOR X5, X1
+ PXOR X6, X2
+ PXOR X7, X3
+ PXOR X8, X4
+
+ PXOR X11, X1
+ PXOR X12, X2
+ PXOR X13, X3
+ PXOR X14, X4
+
+ ADDQ $0x40, DI
+ ADDQ $64, SI // buf+=64
+ SUBQ $64, CX // len-=64
+ CMPQ CX, $64 // Less than 64 bytes left?
+ JGE loopback64
+
+ // Fold result into a single register (X1)
+remain64:
+ MOVOA r4r3kp<>+0(SB), X0
+
+ MOVOA X1, X5
+ PCLMULQDQ $0, X0, X1
+ PCLMULQDQ $0x11, X0, X5
+ PXOR X5, X1
+ PXOR X2, X1
+
+ MOVOA X1, X5
+ PCLMULQDQ $0, X0, X1
+ PCLMULQDQ $0x11, X0, X5
+ PXOR X5, X1
+ PXOR X3, X1
+
+ MOVOA X1, X5
+ PCLMULQDQ $0, X0, X1
+ PCLMULQDQ $0x11, X0, X5
+ PXOR X5, X1
+ PXOR X4, X1
+
+ // If there is less than 16 bytes left we are done
+ CMPQ CX, $16
+ JB finish
+
+ // Encode 16 bytes
+remain16:
+ MOVOU (SI), X10
+ MOVOA X1, X5
+ PCLMULQDQ $0, X0, X1
+ PCLMULQDQ $0x11, X0, X5
+ PXOR X5, X1
+ PXOR X10, X1
+ SUBQ $16, CX
+ ADDQ $16, SI
+ CMPQ CX, $16
+ JGE remain16
+
+finish:
+ // Fold final result into 32 bits and return it
+ PCMPEQB X3, X3
+ PCLMULQDQ $1, X1, X0
+ PSRLDQ $8, X1
+ PXOR X0, X1
+
+ MOVOA X1, X2
+ MOVQ r5kp<>+0(SB), X0
+
+ // Create a 32-bit mask; we don't care about the upper half.
+ PSRLQ $32, X3
+
+ PSRLDQ $4, X2
+ PAND X3, X1
+ PCLMULQDQ $0, X0, X1
+ PXOR X2, X1
+
+ MOVOA rupolykp<>+0(SB), X0
+
+ MOVOA X1, X2
+ PAND X3, X1
+ PCLMULQDQ $0x10, X0, X1
+ PAND X3, X1
+ PCLMULQDQ $0, X0, X1
+ PXOR X2, X1
+
+ // PEXTRD $1, X1, AX (SSE 4.1)
+ BYTE $0x66; BYTE $0x0f; BYTE $0x3a
+ BYTE $0x16; BYTE $0xc8; BYTE $0x01
+ MOVL AX, ret+32(FP)
+
+ RET
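
haveSSE41, haveSSE42 and haveCLMUL above execute CPUID with EAX=1 and test ECX bits 19, 20 and 1 respectively. Purely for illustration, and assuming golang.org/x/sys/cpu were acceptable as a dependency (this package deliberately keeps the detection in assembly), the same checks could be expressed in Go as:

    package main

    import (
    	"fmt"

    	"golang.org/x/sys/cpu"
    )

    func main() {
    	// Same conditions as sse42 and useFastIEEE in crc32_amd64.go:
    	// SSE 4.2 gates the Castagnoli path, CLMUL plus SSE 4.1 the IEEE path.
    	fmt.Println("castagnoli accel:", cpu.X86.HasSSE42)
    	fmt.Println("ieee accel:      ", cpu.X86.HasPCLMULQDQ && cpu.X86.HasSSE41)
    }
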
diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64p32.go b/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
new file mode 100644
index 0000000000..3222b06a5a
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
@@ -0,0 +1,43 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine,!gccgo
+
+package crc32
+
+// This file contains the code to call the SSE 4.2 version of the Castagnoli
+// CRC.
+
+// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
+// support.
+func haveSSE42() bool
+
+// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32
+// instruction.
+//go:noescape
+func castagnoliSSE42(crc uint32, p []byte) uint32
+
+var sse42 = haveSSE42()
+
+func archAvailableCastagnoli() bool {
+ return sse42
+}
+
+func archInitCastagnoli() {
+ if !sse42 {
+ panic("not available")
+ }
+ // No initialization necessary.
+}
+
+func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
+ if !sse42 {
+ panic("not available")
+ }
+ return castagnoliSSE42(crc, p)
+}
+
+func archAvailableIEEE() bool { return false }
+func archInitIEEE() { panic("not available") }
+func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }
diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64p32.s b/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
new file mode 100644
index 0000000000..a578d685cc
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
@@ -0,0 +1,67 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build gc
+
+#define NOSPLIT 4
+#define RODATA 8
+
+// func castagnoliSSE42(crc uint32, p []byte) uint32
+TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
+ MOVL crc+0(FP), AX // CRC value
+ MOVL p+4(FP), SI // data pointer
+ MOVL p_len+8(FP), CX // len(p)
+
+ NOTL AX
+
+ // If there are fewer than 8 bytes to process, we do it byte by byte.
+ CMPQ CX, $8
+ JL cleanup
+
+ // Process individual bytes until the input is 8-byte aligned.
+startup:
+ MOVQ SI, BX
+ ANDQ $7, BX
+ JZ aligned
+
+ CRC32B (SI), AX
+ DECQ CX
+ INCQ SI
+ JMP startup
+
+aligned:
+ // The input is now 8-byte aligned and we can process 8-byte chunks.
+ CMPQ CX, $8
+ JL cleanup
+
+ CRC32Q (SI), AX
+ ADDQ $8, SI
+ SUBQ $8, CX
+ JMP aligned
+
+cleanup:
+ // We may have some bytes left over that we process one at a time.
+ CMPQ CX, $0
+ JE done
+
+ CRC32B (SI), AX
+ INCQ SI
+ DECQ CX
+ JMP cleanup
+
+done:
+ NOTL AX
+ MOVL AX, ret+16(FP)
+ RET
+
+// func haveSSE42() bool
+TEXT ·haveSSE42(SB), NOSPLIT, $0
+ XORQ AX, AX
+ INCL AX
+ CPUID
+ SHRQ $20, CX
+ ANDQ $1, CX
+ MOVB CX, ret+0(FP)
+ RET
+
diff --git a/vendor/github.com/klauspost/crc32/crc32_generic.go b/vendor/github.com/klauspost/crc32/crc32_generic.go
new file mode 100644
index 0000000000..abacbb663d
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/crc32_generic.go
@@ -0,0 +1,89 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains CRC32 algorithms that are not specific to any architecture
+// and don't use hardware acceleration.
+//
+// The simple (and slow) CRC32 implementation uses only a 256*4-byte table.
+//
+// The slicing-by-8 algorithm is a faster implementation that uses a bigger
+// table (8*256*4 bytes).
+
+package crc32
+
+// simpleMakeTable allocates and constructs a Table for the specified
+// polynomial. The table is suitable for use with the simple algorithm
+// (simpleUpdate).
+func simpleMakeTable(poly uint32) *Table {
+ t := new(Table)
+ simplePopulateTable(poly, t)
+ return t
+}
+
+// simplePopulateTable constructs a Table for the specified polynomial, suitable
+// for use with simpleUpdate.
+func simplePopulateTable(poly uint32, t *Table) {
+ for i := 0; i < 256; i++ {
+ crc := uint32(i)
+ for j := 0; j < 8; j++ {
+ if crc&1 == 1 {
+ crc = (crc >> 1) ^ poly
+ } else {
+ crc >>= 1
+ }
+ }
+ t[i] = crc
+ }
+}
+
+// simpleUpdate uses the simple algorithm to update the CRC, given a table that
+// was previously computed using simpleMakeTable.
+func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 {
+ crc = ^crc
+ for _, v := range p {
+ crc = tab[byte(crc)^v] ^ (crc >> 8)
+ }
+ return ^crc
+}
+
+// Use slicing-by-8 when payload >= this value.
+const slicing8Cutoff = 16
+
+// slicing8Table is an array of 8 Tables, used by the slicing-by-8 algorithm.
+type slicing8Table [8]Table
+
+// slicingMakeTable constructs a slicing8Table for the specified polynomial. The
+// table is suitable for use with the slicing-by-8 algorithm (slicingUpdate).
+func slicingMakeTable(poly uint32) *slicing8Table {
+ t := new(slicing8Table)
+ simplePopulateTable(poly, &t[0])
+ for i := 0; i < 256; i++ {
+ crc := t[0][i]
+ for j := 1; j < 8; j++ {
+ crc = t[0][crc&0xFF] ^ (crc >> 8)
+ t[j][i] = crc
+ }
+ }
+ return t
+}
+
+// slicingUpdate uses the slicing-by-8 algorithm to update the CRC, given a
+// table that was previously computed using slicingMakeTable.
+func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 {
+ if len(p) >= slicing8Cutoff {
+ crc = ^crc
+ for len(p) > 8 {
+ crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
+ crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
+ tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
+ tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
+ p = p[8:]
+ }
+ crc = ^crc
+ }
+ if len(p) == 0 {
+ return crc
+ }
+ return simpleUpdate(crc, &tab[0], p)
+}
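
simpleUpdate above is the table-driven form of the textbook bit-at-a-time reflected CRC; simplePopulateTable just precomputes the eight shift/xor steps for every possible byte. A self-contained sketch of the bitwise form (bitwiseCRC32 is a hypothetical helper, not part of this package), checked against the standard library:

    package main

    import (
    	"fmt"
    	"hash/crc32"
    )

    // bitwiseCRC32 computes a reflected CRC-32 one bit at a time; the table built
    // by simplePopulateTable precomputes these eight shift/xor steps per byte.
    func bitwiseCRC32(poly uint32, p []byte) uint32 {
    	crc := ^uint32(0)
    	for _, v := range p {
    		crc ^= uint32(v)
    		for i := 0; i < 8; i++ {
    			if crc&1 == 1 {
    				crc = (crc >> 1) ^ poly
    			} else {
    				crc >>= 1
    			}
    		}
    	}
    	return ^crc
    }

    func main() {
    	data := []byte("hello, world")
    	fmt.Println(bitwiseCRC32(crc32.IEEE, data) == crc32.ChecksumIEEE(data)) // true
    }
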
diff --git a/vendor/github.com/klauspost/crc32/crc32_otherarch.go b/vendor/github.com/klauspost/crc32/crc32_otherarch.go
new file mode 100644
index 0000000000..cc960764bc
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/crc32_otherarch.go
@@ -0,0 +1,15 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!amd64p32,!s390x
+
+package crc32
+
+func archAvailableIEEE() bool { return false }
+func archInitIEEE() { panic("not available") }
+func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }
+
+func archAvailableCastagnoli() bool { return false }
+func archInitCastagnoli() { panic("not available") }
+func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") }
diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.go b/vendor/github.com/klauspost/crc32/crc32_s390x.go
new file mode 100644
index 0000000000..ce96f03281
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/crc32_s390x.go
@@ -0,0 +1,91 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build s390x
+
+package crc32
+
+const (
+ vxMinLen = 64
+ vxAlignMask = 15 // align to 16 bytes
+)
+
+// hasVectorFacility reports whether the machine has the z/Architecture
+// vector facility installed and enabled.
+func hasVectorFacility() bool
+
+var hasVX = hasVectorFacility()
+
+// vectorizedCastagnoli implements CRC32 using vector instructions.
+// It is defined in crc32_s390x.s.
+//go:noescape
+func vectorizedCastagnoli(crc uint32, p []byte) uint32
+
+// vectorizedIEEE implements CRC32 using vector instructions.
+// It is defined in crc32_s390x.s.
+//go:noescape
+func vectorizedIEEE(crc uint32, p []byte) uint32
+
+func archAvailableCastagnoli() bool {
+ return hasVX
+}
+
+var archCastagnoliTable8 *slicing8Table
+
+func archInitCastagnoli() {
+ if !hasVX {
+ panic("not available")
+ }
+ // We still use slicing-by-8 for small buffers.
+ archCastagnoliTable8 = slicingMakeTable(Castagnoli)
+}
+
+// archUpdateCastagnoli calculates the checksum of p using
+// vectorizedCastagnoli.
+func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
+ if !hasVX {
+ panic("not available")
+ }
+ // Use vectorized function if data length is above threshold.
+ if len(p) >= vxMinLen {
+ aligned := len(p) & ^vxAlignMask
+ crc = vectorizedCastagnoli(crc, p[:aligned])
+ p = p[aligned:]
+ }
+ if len(p) == 0 {
+ return crc
+ }
+ return slicingUpdate(crc, archCastagnoliTable8, p)
+}
+
+func archAvailableIEEE() bool {
+ return hasVX
+}
+
+var archIeeeTable8 *slicing8Table
+
+func archInitIEEE() {
+ if !hasVX {
+ panic("not available")
+ }
+ // We still use slicing-by-8 for small buffers.
+ archIeeeTable8 = slicingMakeTable(IEEE)
+}
+
+// archUpdateIEEE calculates the checksum of p using vectorizedIEEE.
+func archUpdateIEEE(crc uint32, p []byte) uint32 {
+ if !hasVX {
+ panic("not available")
+ }
+ // Use vectorized function if data length is above threshold.
+ if len(p) >= vxMinLen {
+ aligned := len(p) & ^vxAlignMask
+ crc = vectorizedIEEE(crc, p[:aligned])
+ p = p[aligned:]
+ }
+ if len(p) == 0 {
+ return crc
+ }
+ return slicingUpdate(crc, archIeeeTable8, p)
+}
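
The len(p) &^ vxAlignMask step in both arch update functions hands the vector routine only whole 16-byte blocks, leaving a tail of at most 15 bytes to slicing-by-8. A small sketch of how that mask splits a buffer length, reusing the constant values above:

    package main

    import "fmt"

    const (
    	vxMinLen    = 64 // same values as in crc32_s390x.go
    	vxAlignMask = 15 // align to 16 bytes
    )

    func main() {
    	for _, n := range []int{64, 70, 95, 100} {
    		if n < vxMinLen {
    			continue // the vector path is only taken for longer buffers
    		}
    		aligned := n &^ vxAlignMask // largest multiple of 16 not exceeding n
    		fmt.Printf("len=%d: %d bytes vectorized, %d-byte tail via slicing-by-8\n",
    			n, aligned, n-aligned)
    	}
    }
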
diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.s b/vendor/github.com/klauspost/crc32/crc32_s390x.s
new file mode 100644
index 0000000000..e980ca29d6
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/crc32_s390x.s
@@ -0,0 +1,249 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build s390x
+
+#include "textflag.h"
+
+// Vector register range containing CRC-32 constants
+
+#define CONST_PERM_LE2BE V9
+#define CONST_R2R1 V10
+#define CONST_R4R3 V11
+#define CONST_R5 V12
+#define CONST_RU_POLY V13
+#define CONST_CRC_POLY V14
+
+// The CRC-32 constant block contains reduction constants to fold and
+// process particular chunks of the input data stream in parallel.
+//
+// Note that the constant definitions below are extended in order to compute
+// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
+// The rightmost doubleword can be 0 to prevent contribution to the result or
+// can be multiplied by 1 to perform an XOR without the need for a separate
+// VECTOR EXCLUSIVE OR instruction.
+//
+// The polynomials used are bit-reflected:
+//
+// IEEE: P'(x) = 0x0edb88320
+// Castagnoli: P'(x) = 0x082f63b78
+
+// IEEE polynomial constants
+DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
+DATA ·crcleconskp+8(SB)/8, $0x0706050403020100
+DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
+DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
+DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
+DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
+DATA ·crcleconskp+48(SB)/8, $0x0000000000000000
+DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
+DATA ·crcleconskp+64(SB)/8, $0x0000000000000000
+DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
+DATA ·crcleconskp+80(SB)/8, $0x0000000000000000
+DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
+
+GLOBL ·crcleconskp(SB), RODATA, $144
+
+// Castagnoli polynomial constants
+DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
+DATA ·crccleconskp+8(SB)/8, $0x0706050403020100
+DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
+DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
+DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
+DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
+DATA ·crccleconskp+48(SB)/8, $0x0000000000000000
+DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
+DATA ·crccleconskp+64(SB)/8, $0x0000000000000000
+DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
+DATA ·crccleconskp+80(SB)/8, $0x0000000000000000
+DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
+
+GLOBL ·crccleconskp(SB), RODATA, $144
+
+// func hasVectorFacility() bool
+TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
+ MOVD $x-24(SP), R1
+ XC $24, 0(R1), 0(R1) // clear the storage
+ MOVD $2, R0 // R0 is the number of doublewords stored minus 1
+ WORD $0xB2B01000 // STFLE 0(R1)
+ XOR R0, R0 // reset the value of R0
+ MOVBZ z-8(SP), R1
+ AND $0x40, R1
+ BEQ novector
+
+vectorinstalled:
+ // check if the vector instruction has been enabled
+ VLEIB $0, $0xF, V16
+ VLGVB $0, V16, R1
+ CMPBNE R1, $0xF, novector
+ MOVB $1, ret+0(FP) // have vx
+ RET
+
+novector:
+ MOVB $0, ret+0(FP) // no vx
+ RET
+
+// The CRC-32 function(s) use these calling conventions:
+//
+// Parameters:
+//
+// R2: Initial CRC value, typically ~0; and final CRC (return) value.
+// R3: Input buffer pointer, performance might be improved if the
+// buffer is on a doubleword boundary.
+// R4: Length of the buffer, must be 64 bytes or greater.
+//
+// Register usage:
+//
+// R5: CRC-32 constant pool base pointer.
+// V0: Initial CRC value and intermediate constants and results.
+// V1..V4: Data for CRC computation.
+// V5..V8: Next data chunks that are fetched from the input buffer.
+//
+// V9..V14: CRC-32 constants.
+
+// func vectorizedIEEE(crc uint32, p []byte) uint32
+TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
+ MOVWZ crc+0(FP), R2 // R2 stores the CRC value
+ MOVD p+8(FP), R3 // data pointer
+ MOVD p_len+16(FP), R4 // len(p)
+
+ MOVD $·crcleconskp(SB), R5
+ BR vectorizedBody<>(SB)
+
+// func vectorizedCastagnoli(crc uint32, p []byte) uint32
+TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
+ MOVWZ crc+0(FP), R2 // R2 stores the CRC value
+ MOVD p+8(FP), R3 // data pointer
+ MOVD p_len+16(FP), R4 // len(p)
+
+ // R5: crc-32 constant pool base pointer, constant is used to reduce crc
+ MOVD $·crccleconskp(SB), R5
+ BR vectorizedBody<>(SB)
+
+TEXT vectorizedBody<>(SB), NOSPLIT, $0
+ XOR $0xffffffff, R2 // NOTW R2
+ VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
+
+ // Load the initial CRC value into the rightmost word of V0
+ VZERO V0
+ VLVGF $3, R2, V0
+
+ // Crash if the input size is less than 64 bytes.
+ CMP R4, $64
+ BLT crash
+
+ // Load a 64-byte data chunk and XOR with CRC
+ VLM 0(R3), V1, V4 // 64-bytes into V1..V4
+
+ // Reflect the data if the CRC operation is in the bit-reflected domain
+ VPERM V1, V1, CONST_PERM_LE2BE, V1
+ VPERM V2, V2, CONST_PERM_LE2BE, V2
+ VPERM V3, V3, CONST_PERM_LE2BE, V3
+ VPERM V4, V4, CONST_PERM_LE2BE, V4
+
+ VX V0, V1, V1 // V1 ^= CRC
+ ADD $64, R3 // BUF = BUF + 64
+ ADD $(-64), R4
+
+ // Check remaining buffer size and jump to proper folding method
+ CMP R4, $64
+ BLT less_than_64bytes
+
+fold_64bytes_loop:
+ // Load the next 64-byte data chunk into V5 to V8
+ VLM 0(R3), V5, V8
+ VPERM V5, V5, CONST_PERM_LE2BE, V5
+ VPERM V6, V6, CONST_PERM_LE2BE, V6
+ VPERM V7, V7, CONST_PERM_LE2BE, V7
+ VPERM V8, V8, CONST_PERM_LE2BE, V8
+
+ // Perform a GF(2) multiplication of the doublewords in V1 with
+ // the reduction constants in V0. The intermediate result is
+ // then folded (accumulated) with the next data chunk in V5 and
+ // stored in V1. Repeat this step for the register contents
+ // in V2, V3, and V4 respectively.
+
+ VGFMAG CONST_R2R1, V1, V5, V1
+ VGFMAG CONST_R2R1, V2, V6, V2
+ VGFMAG CONST_R2R1, V3, V7, V3
+ VGFMAG CONST_R2R1, V4, V8, V4
+
+ // Adjust buffer pointer and length for next loop
+ ADD $64, R3 // BUF = BUF + 64
+ ADD $(-64), R4 // LEN = LEN - 64
+
+ CMP R4, $64
+ BGE fold_64bytes_loop
+
+less_than_64bytes:
+ // Fold V1 to V4 into a single 128-bit value in V1
+ VGFMAG CONST_R4R3, V1, V2, V1
+ VGFMAG CONST_R4R3, V1, V3, V1
+ VGFMAG CONST_R4R3, V1, V4, V1
+
+ // Check whether to continue with 64-bit folding
+ CMP R4, $16
+ BLT final_fold
+
+fold_16bytes_loop:
+ VL 0(R3), V2 // Load next data chunk
+ VPERM V2, V2, CONST_PERM_LE2BE, V2
+
+ VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
+
+ // Adjust buffer pointer and size for folding next data chunk
+ ADD $16, R3
+ ADD $-16, R4
+
+ // Process remaining data chunks
+ CMP R4, $16
+ BGE fold_16bytes_loop
+
+final_fold:
+ VLEIB $7, $0x40, V9
+ VSRLB V9, CONST_R4R3, V0
+ VLEIG $0, $1, V0
+
+ VGFMG V0, V1, V1
+
+ VLEIB $7, $0x20, V9 // Shift by words
+ VSRLB V9, V1, V2 // Store remaining bits in V2
+ VUPLLF V1, V1 // Split rightmost doubleword
+ VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2
+
+ // The input values to the Barrett reduction are the degree-63 polynomial
+ // in V1 (R(x)), the degree-32 generator polynomial, and the reduction
+ // constant u. The Barrett reduction result is the CRC value of R(x) mod
+ // P(x).
+ //
+ // The Barrett reduction algorithm is defined as:
+ //
+ // 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ // 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ // 3. C(x) = R(x) XOR T2(x) mod x^32
+ //
+ // Note: To compensate for the division by x^32, use the vector unpack
+ // instruction to move the leftmost word into the leftmost doubleword
+ // of the vector register. The rightmost doubleword is multiplied
+ // by zero so that it does not contribute to the intermediate results.
+
+ // T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ VUPLLF V1, V2
+ VGFMG CONST_RU_POLY, V2, V2
+
+ // Compute the GF(2) product of the CRC polynomial in V0 with T1(x) in
+ // V2 and XOR the intermediate result, T2(x), with the value in V1.
+ // The final result is in the rightmost word of V2.
+
+ VUPLLF V2, V2
+ VGFMAG CONST_CRC_POLY, V2, V1, V2
+
+done:
+ VLGVF $2, V2, R2
+ XOR $0xffffffff, R2 // NOTW R2
+ MOVWZ R2, ret+32(FP)
+ RET
+
+crash:
+ MOVD $0, (R0) // input size is less than 64 bytes