diff options
author | Lunny Xiao <xiaolunwen@gmail.com> | 2019-11-18 13:18:33 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-11-18 13:18:33 +0800 |
commit | 9ff63126274b0df6e035541eafd48970c402e61e (patch) | |
tree | a2ebe40b70d1cdd4ca9e328ad21909b7a0baff10 /vendor/github.com/klauspost | |
parent | ba4e8f221bea0ab40a27da03c7fe3f0f78f6b790 (diff) | |
download | gitea-9ff63126274b0df6e035541eafd48970c402e61e.tar.gz gitea-9ff63126274b0df6e035541eafd48970c402e61e.zip |
Move modules/gzip to gitea.com/macaron/gzip (#9058)
* Move modules/gzip to gitea.com/macaron/gzip
* Fix vendor
Diffstat (limited to 'vendor/github.com/klauspost')
45 files changed, 2901 insertions, 5007 deletions
diff --git a/vendor/github.com/klauspost/compress/LICENSE b/vendor/github.com/klauspost/compress/LICENSE index 7448756763..1eb75ef68e 100644 --- a/vendor/github.com/klauspost/compress/LICENSE +++ b/vendor/github.com/klauspost/compress/LICENSE @@ -1,4 +1,5 @@ Copyright (c) 2012 The Go Authors. All rights reserved. +Copyright (c) 2019 Klaus Post. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/vendor/github.com/klauspost/compress/flate/copy.go b/vendor/github.com/klauspost/compress/flate/copy.go deleted file mode 100644 index a3200a8f49..0000000000 --- a/vendor/github.com/klauspost/compress/flate/copy.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -// forwardCopy is like the built-in copy function except that it always goes -// forward from the start, even if the dst and src overlap. -// It is equivalent to: -// for i := 0; i < n; i++ { -// mem[dst+i] = mem[src+i] -// } -func forwardCopy(mem []byte, dst, src, n int) { - if dst <= src { - copy(mem[dst:dst+n], mem[src:src+n]) - return - } - for { - if dst >= src+n { - copy(mem[dst:dst+n], mem[src:src+n]) - return - } - // There is some forward overlap. The destination - // will be filled with a repeated pattern of mem[src:src+k]. - // We copy one instance of the pattern here, then repeat. - // Each time around this loop k will double. - k := dst - src - copy(mem[dst:dst+k], mem[src:src+k]) - n -= k - dst += k - } -} diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.go b/vendor/github.com/klauspost/compress/flate/crc32_amd64.go deleted file mode 100644 index 70a6095e60..0000000000 --- a/vendor/github.com/klauspost/compress/flate/crc32_amd64.go +++ /dev/null @@ -1,41 +0,0 @@ -//+build !noasm -//+build !appengine - -// Copyright 2015, Klaus Post, see LICENSE for details. - -package flate - -import ( - "github.com/klauspost/cpuid" -) - -// crc32sse returns a hash for the first 4 bytes of the slice -// len(a) must be >= 4. -//go:noescape -func crc32sse(a []byte) uint32 - -// crc32sseAll calculates hashes for each 4-byte set in a. -// dst must be east len(a) - 4 in size. -// The size is not checked by the assembly. -//go:noescape -func crc32sseAll(a []byte, dst []uint32) - -// matchLenSSE4 returns the number of matching bytes in a and b -// up to length 'max'. Both slices must be at least 'max' -// bytes in size. -// -// TODO: drop the "SSE4" name, since it doesn't use any SSE instructions. -// -//go:noescape -func matchLenSSE4(a, b []byte, max int) int - -// histogram accumulates a histogram of b in h. -// h must be at least 256 entries in length, -// and must be cleared before calling this function. -//go:noescape -func histogram(b []byte, h []int32) - -// Detect SSE 4.2 feature. -func init() { - useSSE42 = cpuid.CPU.SSE42() -} diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.s b/vendor/github.com/klauspost/compress/flate/crc32_amd64.s deleted file mode 100644 index 2fb2079b9d..0000000000 --- a/vendor/github.com/klauspost/compress/flate/crc32_amd64.s +++ /dev/null @@ -1,213 +0,0 @@ -//+build !noasm -//+build !appengine - -// Copyright 2015, Klaus Post, see LICENSE for details. - -// func crc32sse(a []byte) uint32 -TEXT ·crc32sse(SB), 4, $0 - MOVQ a+0(FP), R10 - XORQ BX, BX - - // CRC32 dword (R10), EBX - BYTE $0xF2; BYTE $0x41; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0x1a - - MOVL BX, ret+24(FP) - RET - -// func crc32sseAll(a []byte, dst []uint32) -TEXT ·crc32sseAll(SB), 4, $0 - MOVQ a+0(FP), R8 // R8: src - MOVQ a_len+8(FP), R10 // input length - MOVQ dst+24(FP), R9 // R9: dst - SUBQ $4, R10 - JS end - JZ one_crc - MOVQ R10, R13 - SHRQ $2, R10 // len/4 - ANDQ $3, R13 // len&3 - XORQ BX, BX - ADDQ $1, R13 - TESTQ R10, R10 - JZ rem_loop - -crc_loop: - MOVQ (R8), R11 - XORQ BX, BX - XORQ DX, DX - XORQ DI, DI - MOVQ R11, R12 - SHRQ $8, R11 - MOVQ R12, AX - MOVQ R11, CX - SHRQ $16, R12 - SHRQ $16, R11 - MOVQ R12, SI - - // CRC32 EAX, EBX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd8 - - // CRC32 ECX, EDX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd1 - - // CRC32 ESI, EDI - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xfe - MOVL BX, (R9) - MOVL DX, 4(R9) - MOVL DI, 8(R9) - - XORQ BX, BX - MOVL R11, AX - - // CRC32 EAX, EBX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd8 - MOVL BX, 12(R9) - - ADDQ $16, R9 - ADDQ $4, R8 - XORQ BX, BX - SUBQ $1, R10 - JNZ crc_loop - -rem_loop: - MOVL (R8), AX - - // CRC32 EAX, EBX - BYTE $0xF2; BYTE $0x0f - BYTE $0x38; BYTE $0xf1; BYTE $0xd8 - - MOVL BX, (R9) - ADDQ $4, R9 - ADDQ $1, R8 - XORQ BX, BX - SUBQ $1, R13 - JNZ rem_loop - -end: - RET - -one_crc: - MOVQ $1, R13 - XORQ BX, BX - JMP rem_loop - -// func matchLenSSE4(a, b []byte, max int) int -TEXT ·matchLenSSE4(SB), 4, $0 - MOVQ a_base+0(FP), SI - MOVQ b_base+24(FP), DI - MOVQ DI, DX - MOVQ max+48(FP), CX - -cmp8: - // As long as we are 8 or more bytes before the end of max, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. - CMPQ CX, $8 - JLT cmp1 - MOVQ (SI), AX - MOVQ (DI), BX - CMPQ AX, BX - JNE bsf - ADDQ $8, SI - ADDQ $8, DI - SUBQ $8, CX - JMP cmp8 - -bsf: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. The BSF instruction finds the - // least significant 1 bit, the amd64 architecture is little-endian, and - // the shift by 3 converts a bit index to a byte index. - XORQ AX, BX - BSFQ BX, BX - SHRQ $3, BX - ADDQ BX, DI - - // Subtract off &b[0] to convert from &b[ret] to ret, and return. - SUBQ DX, DI - MOVQ DI, ret+56(FP) - RET - -cmp1: - // In the slices' tail, compare 1 byte at a time. - CMPQ CX, $0 - JEQ matchLenEnd - MOVB (SI), AX - MOVB (DI), BX - CMPB AX, BX - JNE matchLenEnd - ADDQ $1, SI - ADDQ $1, DI - SUBQ $1, CX - JMP cmp1 - -matchLenEnd: - // Subtract off &b[0] to convert from &b[ret] to ret, and return. - SUBQ DX, DI - MOVQ DI, ret+56(FP) - RET - -// func histogram(b []byte, h []int32) -TEXT ·histogram(SB), 4, $0 - MOVQ b+0(FP), SI // SI: &b - MOVQ b_len+8(FP), R9 // R9: len(b) - MOVQ h+24(FP), DI // DI: Histogram - MOVQ R9, R8 - SHRQ $3, R8 - JZ hist1 - XORQ R11, R11 - -loop_hist8: - MOVQ (SI), R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - MOVB R10, R11 - INCL (DI)(R11*4) - SHRQ $8, R10 - - INCL (DI)(R10*4) - - ADDQ $8, SI - DECQ R8 - JNZ loop_hist8 - -hist1: - ANDQ $7, R9 - JZ end_hist - XORQ R10, R10 - -loop_hist1: - MOVB (SI), R10 - INCL (DI)(R10*4) - INCQ SI - DECQ R9 - JNZ loop_hist1 - -end_hist: - RET diff --git a/vendor/github.com/klauspost/compress/flate/crc32_noasm.go b/vendor/github.com/klauspost/compress/flate/crc32_noasm.go deleted file mode 100644 index bd98bd598f..0000000000 --- a/vendor/github.com/klauspost/compress/flate/crc32_noasm.go +++ /dev/null @@ -1,35 +0,0 @@ -//+build !amd64 noasm appengine - -// Copyright 2015, Klaus Post, see LICENSE for details. - -package flate - -func init() { - useSSE42 = false -} - -// crc32sse should never be called. -func crc32sse(a []byte) uint32 { - panic("no assembler") -} - -// crc32sseAll should never be called. -func crc32sseAll(a []byte, dst []uint32) { - panic("no assembler") -} - -// matchLenSSE4 should never be called. -func matchLenSSE4(a, b []byte, max int) int { - panic("no assembler") - return 0 -} - -// histogram accumulates a histogram of b in h. -// -// len(h) must be >= 256, and h's elements must be all zeroes. -func histogram(b []byte, h []int32) { - h = h[:256] - for _, t := range b { - h[t]++ - } -} diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go index 76e9682f7e..20c94f5968 100644 --- a/vendor/github.com/klauspost/compress/flate/deflate.go +++ b/vendor/github.com/klauspost/compress/flate/deflate.go @@ -50,8 +50,6 @@ const ( skipNever = math.MaxInt32 ) -var useSSE42 bool - type compressionLevel struct { good, lazy, nice, chain, fastSkipHashing, level int } @@ -77,16 +75,14 @@ var levels = []compressionLevel{ {32, 258, 258, 4096, skipNever, 9}, } -type compressor struct { - compressionLevel - - w *huffmanBitWriter - bulkHasher func([]byte, []uint32) - - // compression algorithm - fill func(*compressor, []byte) int // copy data to window - step func(*compressor) // process window - sync bool // requesting flush +// advancedState contains state for the advanced levels, with bigger hash tables, etc. +type advancedState struct { + // deflate state + length int + offset int + hash uint32 + maxInsertIndex int + ii uint16 // position of last match, intended to overflow to reset. // Input hash chains // hashHead[hashValue] contains the largest inputIndex with the specified hash value @@ -99,55 +95,63 @@ type compressor struct { hashOffset int // input window: unprocessed data is window[index:windowEnd] - index int + index int + hashMatch [maxMatchLength + minMatchLength]uint32 +} + +type compressor struct { + compressionLevel + + w *huffmanBitWriter + + // compression algorithm + fill func(*compressor, []byte) int // copy data to window + step func(*compressor) // process window + sync bool // requesting flush + window []byte windowEnd int blockStart int // window index where current tokens start byteAvailable bool // if true, still need to process window[index-1]. + err error // queued output tokens tokens tokens - - // deflate state - length int - offset int - hash uint32 - maxInsertIndex int - err error - ii uint16 // position of last match, intended to overflow to reset. - - snap snappyEnc - hashMatch [maxMatchLength + minMatchLength]uint32 + fast fastEnc + state *advancedState } func (d *compressor) fillDeflate(b []byte) int { - if d.index >= 2*windowSize-(minMatchLength+maxMatchLength) { + s := d.state + if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) { // shift the window by windowSize copy(d.window[:], d.window[windowSize:2*windowSize]) - d.index -= windowSize + s.index -= windowSize d.windowEnd -= windowSize if d.blockStart >= windowSize { d.blockStart -= windowSize } else { d.blockStart = math.MaxInt32 } - d.hashOffset += windowSize - if d.hashOffset > maxHashOffset { - delta := d.hashOffset - 1 - d.hashOffset -= delta - d.chainHead -= delta - for i, v := range d.hashPrev { + s.hashOffset += windowSize + if s.hashOffset > maxHashOffset { + delta := s.hashOffset - 1 + s.hashOffset -= delta + s.chainHead -= delta + // Iterate over slices instead of arrays to avoid copying + // the entire table onto the stack (Issue #18625). + for i, v := range s.hashPrev[:] { if int(v) > delta { - d.hashPrev[i] = uint32(int(v) - delta) + s.hashPrev[i] = uint32(int(v) - delta) } else { - d.hashPrev[i] = 0 + s.hashPrev[i] = 0 } } - for i, v := range d.hashHead { + for i, v := range s.hashHead[:] { if int(v) > delta { - d.hashHead[i] = uint32(int(v) - delta) + s.hashHead[i] = uint32(int(v) - delta) } else { - d.hashHead[i] = 0 + s.hashHead[i] = 0 } } } @@ -157,14 +161,14 @@ func (d *compressor) fillDeflate(b []byte) int { return n } -func (d *compressor) writeBlock(tok tokens, index int, eof bool) error { +func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error { if index > 0 || eof { var window []byte if d.blockStart <= index { window = d.window[d.blockStart:index] } d.blockStart = index - d.w.writeBlock(tok.tokens[:tok.n], eof, window) + d.w.writeBlock(tok, eof, window) return d.w.err } return nil @@ -173,20 +177,20 @@ func (d *compressor) writeBlock(tok tokens, index int, eof bool) error { // writeBlockSkip writes the current block and uses the number of tokens // to determine if the block should be stored on no matches, or // only huffman encoded. -func (d *compressor) writeBlockSkip(tok tokens, index int, eof bool) error { +func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error { if index > 0 || eof { if d.blockStart <= index { window := d.window[d.blockStart:index] // If we removed less than a 64th of all literals // we huffman compress the block. if int(tok.n) > len(window)-int(tok.n>>6) { - d.w.writeBlockHuff(eof, window) + d.w.writeBlockHuff(eof, window, d.sync) } else { // Write a dynamic huffman block. - d.w.writeBlockDynamic(tok.tokens[:tok.n], eof, window) + d.w.writeBlockDynamic(tok, eof, window, d.sync) } } else { - d.w.writeBlock(tok.tokens[:tok.n], eof, nil) + d.w.writeBlock(tok, eof, nil) } d.blockStart = index return d.w.err @@ -201,10 +205,19 @@ func (d *compressor) writeBlockSkip(tok tokens, index int, eof bool) error { func (d *compressor) fillWindow(b []byte) { // Do not fill window if we are in store-only mode, // use constant or Snappy compression. - switch d.compressionLevel.level { - case 0, 1, 2: + if d.level == 0 { + return + } + if d.fast != nil { + // encode the last data, but discard the result + if len(b) > maxMatchOffset { + b = b[len(b)-maxMatchOffset:] + } + d.fast.Encode(&d.tokens, b) + d.tokens.Reset() return } + s := d.state // If we are given too much, cut it. if len(b) > windowSize { b = b[len(b)-windowSize:] @@ -227,28 +240,28 @@ func (d *compressor) fillWindow(b []byte) { continue } - dst := d.hashMatch[:dstSize] - d.bulkHasher(tocheck, dst) + dst := s.hashMatch[:dstSize] + bulkHash4(tocheck, dst) var newH uint32 for i, val := range dst { di := i + startindex newH = val & hashMask // Get previous value with the same hash. // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = d.hashHead[newH] + s.hashPrev[di&windowMask] = s.hashHead[newH] // Set the head of the hash chain to us. - d.hashHead[newH] = uint32(di + d.hashOffset) + s.hashHead[newH] = uint32(di + s.hashOffset) } - d.hash = newH + s.hash = newH } // Update window information. d.windowEnd += n - d.index = n + s.index = n } // Try to find a match starting at index whose length is greater than prevSize. // We only look at chainCount possibilities before giving up. -// pos = d.index, prevHead = d.chainHead-d.hashOffset, prevLength=minMatchLength-1, lookahead +// pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) { minMatchLook := maxMatchLength if lookahead < minMatchLook { @@ -276,7 +289,7 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead for i := prevHead; tries > 0; tries-- { if wEnd == win[i+length] { - n := matchLen(win[i:], wPos, minMatchLook) + n := matchLen(win[i:i+minMatchLook], wPos) if n > length && (n > minMatchLength || pos-i <= 4096) { length = n @@ -293,62 +306,7 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead // hashPrev[i & windowMask] has already been overwritten, so stop now. break } - i = int(d.hashPrev[i&windowMask]) - d.hashOffset - if i < minIndex || i < 0 { - break - } - } - return -} - -// Try to find a match starting at index whose length is greater than prevSize. -// We only look at chainCount possibilities before giving up. -// pos = d.index, prevHead = d.chainHead-d.hashOffset, prevLength=minMatchLength-1, lookahead -func (d *compressor) findMatchSSE(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) { - minMatchLook := maxMatchLength - if lookahead < minMatchLook { - minMatchLook = lookahead - } - - win := d.window[0 : pos+minMatchLook] - - // We quit when we get a match that's at least nice long - nice := len(win) - pos - if d.nice < nice { - nice = d.nice - } - - // If we've got a match that's good enough, only look in 1/4 the chain. - tries := d.chain - length = prevLength - if length >= d.good { - tries >>= 2 - } - - wEnd := win[pos+length] - wPos := win[pos:] - minIndex := pos - windowSize - - for i := prevHead; tries > 0; tries-- { - if wEnd == win[i+length] { - n := matchLenSSE4(win[i:], wPos, minMatchLook) - - if n > length && (n > minMatchLength || pos-i <= 4096) { - length = n - offset = pos - i - ok = true - if n >= nice { - // The match is good enough that we don't try to find a better one. - break - } - wEnd = win[pos+n] - } - } - if i == minIndex { - // hashPrev[i & windowMask] has already been overwritten, so stop now. - break - } - i = int(d.hashPrev[i&windowMask]) - d.hashOffset + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset if i < minIndex || i < 0 { break } @@ -364,293 +322,139 @@ func (d *compressor) writeStoredBlock(buf []byte) error { return d.w.err } -const hashmul = 0x1e35a7bd - // hash4 returns a hash representation of the first 4 bytes // of the supplied slice. // The caller must ensure that len(b) >= 4. func hash4(b []byte) uint32 { - return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits) + b = b[:4] + return hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits) } // bulkHash4 will compute hashes using the same // algorithm as hash4 func bulkHash4(b []byte, dst []uint32) { - if len(b) < minMatchLength { + if len(b) < 4 { return } hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 - dst[0] = (hb * hashmul) >> (32 - hashBits) - end := len(b) - minMatchLength + 1 + dst[0] = hash4u(hb, hashBits) + end := len(b) - 4 + 1 for i := 1; i < end; i++ { hb = (hb << 8) | uint32(b[i+3]) - dst[i] = (hb * hashmul) >> (32 - hashBits) + dst[i] = hash4u(hb, hashBits) } } -// matchLen returns the number of matching bytes in a and b -// up to length 'max'. Both slices must be at least 'max' -// bytes in size. -func matchLen(a, b []byte, max int) int { - a = a[:max] - b = b[:len(a)] - for i, av := range a { - if b[i] != av { - return i - } - } - return max -} - func (d *compressor) initDeflate() { d.window = make([]byte, 2*windowSize) - d.hashOffset = 1 - d.length = minMatchLength - 1 - d.offset = 0 d.byteAvailable = false - d.index = 0 - d.hash = 0 - d.chainHead = -1 - d.bulkHasher = bulkHash4 - if useSSE42 { - d.bulkHasher = crc32sseAll - } -} - -// Assumes that d.fastSkipHashing != skipNever, -// otherwise use deflateLazy -func (d *compressor) deflate() { - - // Sanity enables additional runtime tests. - // It's intended to be used during development - // to supplement the currently ad-hoc unit tests. - const sanity = false - - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { + d.err = nil + if d.state == nil { return } - - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if d.index < d.maxInsertIndex { - d.hash = hash4(d.window[d.index : d.index+minMatchLength]) - } - - for { - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - d.index - if lookahead < minMatchLength+maxMatchLength { - if !d.sync { - return - } - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - if lookahead == 0 { - if d.tokens.n > 0 { - if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - return - } - } - if d.index < d.maxInsertIndex { - // Update the hash - d.hash = hash4(d.window[d.index : d.index+minMatchLength]) - ch := d.hashHead[d.hash&hashMask] - d.chainHead = int(ch) - d.hashPrev[d.index&windowMask] = ch - d.hashHead[d.hash&hashMask] = uint32(d.index + d.hashOffset) - } - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize - if minIndex < 0 { - minIndex = 0 - } - - if d.chainHead-d.hashOffset >= minIndex && lookahead > minMatchLength-1 { - if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset - } - } - if d.length >= minMatchLength { - d.ii = 0 - // There was a match at the previous step, and the current match is - // not better. Output the previous match. - // "d.length-3" should NOT be "d.length-minMatchLength", since the format always assume 3 - d.tokens.tokens[d.tokens.n] = matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize)) - d.tokens.n++ - // Insert in the hash table all strings up to the end of the match. - // index and index-1 are already inserted. If there is not enough - // lookahead, the last two strings are not inserted into the hash - // table. - if d.length <= d.fastSkipHashing { - var newIndex int - newIndex = d.index + d.length - // Calculate missing hashes - end := newIndex - if end > d.maxInsertIndex { - end = d.maxInsertIndex - } - end += minMatchLength - 1 - startindex := d.index + 1 - if startindex > d.maxInsertIndex { - startindex = d.maxInsertIndex - } - tocheck := d.window[startindex:end] - dstSize := len(tocheck) - minMatchLength + 1 - if dstSize > 0 { - dst := d.hashMatch[:dstSize] - bulkHash4(tocheck, dst) - var newH uint32 - for i, val := range dst { - di := i + startindex - newH = val & hashMask - // Get previous value with the same hash. - // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = d.hashHead[newH] - // Set the head of the hash chain to us. - d.hashHead[newH] = uint32(di + d.hashOffset) - } - d.hash = newH - } - d.index = newIndex - } else { - // For matches this long, we don't bother inserting each individual - // item into the table. - d.index += d.length - if d.index < d.maxInsertIndex { - d.hash = hash4(d.window[d.index : d.index+minMatchLength]) - } - } - if d.tokens.n == maxFlateBlockTokens { - // The block includes the current character - if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } else { - d.ii++ - end := d.index + int(d.ii>>uint(d.fastSkipHashing)) + 1 - if end > d.windowEnd { - end = d.windowEnd - } - for i := d.index; i < end; i++ { - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } - d.index = end - } - } + s := d.state + s.index = 0 + s.hashOffset = 1 + s.length = minMatchLength - 1 + s.offset = 0 + s.hash = 0 + s.chainHead = -1 } // deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever, // meaning it always has lazy matching on. func (d *compressor) deflateLazy() { + s := d.state // Sanity enables additional runtime tests. // It's intended to be used during development // to supplement the currently ad-hoc unit tests. const sanity = false - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { + if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { return } - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if d.index < d.maxInsertIndex { - d.hash = hash4(d.window[d.index : d.index+minMatchLength]) + s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) + if s.index < s.maxInsertIndex { + s.hash = hash4(d.window[s.index : s.index+minMatchLength]) } for { - if sanity && d.index > d.windowEnd { + if sanity && s.index > d.windowEnd { panic("index > windowEnd") } - lookahead := d.windowEnd - d.index + lookahead := d.windowEnd - s.index if lookahead < minMatchLength+maxMatchLength { if !d.sync { return } - if sanity && d.index > d.windowEnd { + if sanity && s.index > d.windowEnd { panic("index > windowEnd") } if lookahead == 0 { // Flush current output block if any. if d.byteAvailable { // There is still one pending token that needs to be flushed - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ + d.tokens.AddLiteral(d.window[s.index-1]) d.byteAvailable = false } if d.tokens.n > 0 { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens.n = 0 + d.tokens.Reset() } return } } - if d.index < d.maxInsertIndex { + if s.index < s.maxInsertIndex { // Update the hash - d.hash = hash4(d.window[d.index : d.index+minMatchLength]) - ch := d.hashHead[d.hash&hashMask] - d.chainHead = int(ch) - d.hashPrev[d.index&windowMask] = ch - d.hashHead[d.hash&hashMask] = uint32(d.index + d.hashOffset) - } - prevLength := d.length - prevOffset := d.offset - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize + s.hash = hash4(d.window[s.index : s.index+minMatchLength]) + ch := s.hashHead[s.hash&hashMask] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[s.hash&hashMask] = uint32(s.index + s.hashOffset) + } + prevLength := s.length + prevOffset := s.offset + s.length = minMatchLength - 1 + s.offset = 0 + minIndex := s.index - windowSize if minIndex < 0 { minIndex = 0 } - if d.chainHead-d.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { - if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset + if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { + if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok { + s.length = newLength + s.offset = newOffset } } - if prevLength >= minMatchLength && d.length <= prevLength { + if prevLength >= minMatchLength && s.length <= prevLength { // There was a match at the previous step, and the current match is // not better. Output the previous match. - d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) - d.tokens.n++ + d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) // Insert in the hash table all strings up to the end of the match. // index and index-1 are already inserted. If there is not enough // lookahead, the last two strings are not inserted into the hash // table. var newIndex int - newIndex = d.index + prevLength - 1 + newIndex = s.index + prevLength - 1 // Calculate missing hashes end := newIndex - if end > d.maxInsertIndex { - end = d.maxInsertIndex + if end > s.maxInsertIndex { + end = s.maxInsertIndex } end += minMatchLength - 1 - startindex := d.index + 1 - if startindex > d.maxInsertIndex { - startindex = d.maxInsertIndex + startindex := s.index + 1 + if startindex > s.maxInsertIndex { + startindex = s.maxInsertIndex } tocheck := d.window[startindex:end] dstSize := len(tocheck) - minMatchLength + 1 if dstSize > 0 { - dst := d.hashMatch[:dstSize] + dst := s.hashMatch[:dstSize] bulkHash4(tocheck, dst) var newH uint32 for i, val := range dst { @@ -658,390 +462,71 @@ func (d *compressor) deflateLazy() { newH = val & hashMask // Get previous value with the same hash. // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = d.hashHead[newH] - // Set the head of the hash chain to us. - d.hashHead[newH] = uint32(di + d.hashOffset) - } - d.hash = newH - } - - d.index = newIndex - d.byteAvailable = false - d.length = minMatchLength - 1 - if d.tokens.n == maxFlateBlockTokens { - // The block includes the current character - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } else { - // Reset, if we got a match this run. - if d.length >= minMatchLength { - d.ii = 0 - } - // We have a byte waiting. Emit it. - if d.byteAvailable { - d.ii++ - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - d.index++ - - // If we have a long run of no matches, skip additional bytes - // Resets when d.ii overflows after 64KB. - if d.ii > 31 { - n := int(d.ii >> 5) - for j := 0; j < n; j++ { - if d.index >= d.windowEnd-1 { - break - } - - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - d.index++ - } - // Flush last byte - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - d.byteAvailable = false - // d.length = minMatchLength - 1 // not needed, since d.ii is reset above, so it should never be > minMatchLength - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } - } else { - d.index++ - d.byteAvailable = true - } - } - } -} - -// Assumes that d.fastSkipHashing != skipNever, -// otherwise use deflateLazySSE -func (d *compressor) deflateSSE() { - - // Sanity enables additional runtime tests. - // It's intended to be used during development - // to supplement the currently ad-hoc unit tests. - const sanity = false - - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { - return - } - - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if d.index < d.maxInsertIndex { - d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask - } - - for { - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - d.index - if lookahead < minMatchLength+maxMatchLength { - if !d.sync { - return - } - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - if lookahead == 0 { - if d.tokens.n > 0 { - if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - return - } - } - if d.index < d.maxInsertIndex { - // Update the hash - d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask - ch := d.hashHead[d.hash] - d.chainHead = int(ch) - d.hashPrev[d.index&windowMask] = ch - d.hashHead[d.hash] = uint32(d.index + d.hashOffset) - } - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize - if minIndex < 0 { - minIndex = 0 - } - - if d.chainHead-d.hashOffset >= minIndex && lookahead > minMatchLength-1 { - if newLength, newOffset, ok := d.findMatchSSE(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset - } - } - if d.length >= minMatchLength { - d.ii = 0 - // There was a match at the previous step, and the current match is - // not better. Output the previous match. - // "d.length-3" should NOT be "d.length-minMatchLength", since the format always assume 3 - d.tokens.tokens[d.tokens.n] = matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize)) - d.tokens.n++ - // Insert in the hash table all strings up to the end of the match. - // index and index-1 are already inserted. If there is not enough - // lookahead, the last two strings are not inserted into the hash - // table. - if d.length <= d.fastSkipHashing { - var newIndex int - newIndex = d.index + d.length - // Calculate missing hashes - end := newIndex - if end > d.maxInsertIndex { - end = d.maxInsertIndex - } - end += minMatchLength - 1 - startindex := d.index + 1 - if startindex > d.maxInsertIndex { - startindex = d.maxInsertIndex - } - tocheck := d.window[startindex:end] - dstSize := len(tocheck) - minMatchLength + 1 - if dstSize > 0 { - dst := d.hashMatch[:dstSize] - - crc32sseAll(tocheck, dst) - var newH uint32 - for i, val := range dst { - di := i + startindex - newH = val & hashMask - // Get previous value with the same hash. - // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = d.hashHead[newH] - // Set the head of the hash chain to us. - d.hashHead[newH] = uint32(di + d.hashOffset) - } - d.hash = newH - } - d.index = newIndex - } else { - // For matches this long, we don't bother inserting each individual - // item into the table. - d.index += d.length - if d.index < d.maxInsertIndex { - d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask - } - } - if d.tokens.n == maxFlateBlockTokens { - // The block includes the current character - if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } else { - d.ii++ - end := d.index + int(d.ii>>5) + 1 - if end > d.windowEnd { - end = d.windowEnd - } - for i := d.index; i < end; i++ { - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i])) - d.tokens.n++ - if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil { - return - } - d.tokens.n = 0 - } - } - d.index = end - } - } -} - -// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever, -// meaning it always has lazy matching on. -func (d *compressor) deflateLazySSE() { - // Sanity enables additional runtime tests. - // It's intended to be used during development - // to supplement the currently ad-hoc unit tests. - const sanity = false - - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { - return - } - - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if d.index < d.maxInsertIndex { - d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask - } - - for { - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - d.index - if lookahead < minMatchLength+maxMatchLength { - if !d.sync { - return - } - if sanity && d.index > d.windowEnd { - panic("index > windowEnd") - } - if lookahead == 0 { - // Flush current output block if any. - if d.byteAvailable { - // There is still one pending token that needs to be flushed - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ - d.byteAvailable = false - } - if d.tokens.n > 0 { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { - return - } - d.tokens.n = 0 - } - return - } - } - if d.index < d.maxInsertIndex { - // Update the hash - d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask - ch := d.hashHead[d.hash] - d.chainHead = int(ch) - d.hashPrev[d.index&windowMask] = ch - d.hashHead[d.hash] = uint32(d.index + d.hashOffset) - } - prevLength := d.length - prevOffset := d.offset - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize - if minIndex < 0 { - minIndex = 0 - } - - if d.chainHead-d.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { - if newLength, newOffset, ok := d.findMatchSSE(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset - } - } - if prevLength >= minMatchLength && d.length <= prevLength { - // There was a match at the previous step, and the current match is - // not better. Output the previous match. - d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) - d.tokens.n++ - - // Insert in the hash table all strings up to the end of the match. - // index and index-1 are already inserted. If there is not enough - // lookahead, the last two strings are not inserted into the hash - // table. - var newIndex int - newIndex = d.index + prevLength - 1 - // Calculate missing hashes - end := newIndex - if end > d.maxInsertIndex { - end = d.maxInsertIndex - } - end += minMatchLength - 1 - startindex := d.index + 1 - if startindex > d.maxInsertIndex { - startindex = d.maxInsertIndex - } - tocheck := d.window[startindex:end] - dstSize := len(tocheck) - minMatchLength + 1 - if dstSize > 0 { - dst := d.hashMatch[:dstSize] - crc32sseAll(tocheck, dst) - var newH uint32 - for i, val := range dst { - di := i + startindex - newH = val & hashMask - // Get previous value with the same hash. - // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = d.hashHead[newH] + s.hashPrev[di&windowMask] = s.hashHead[newH] // Set the head of the hash chain to us. - d.hashHead[newH] = uint32(di + d.hashOffset) + s.hashHead[newH] = uint32(di + s.hashOffset) } - d.hash = newH + s.hash = newH } - d.index = newIndex + s.index = newIndex d.byteAvailable = false - d.length = minMatchLength - 1 + s.length = minMatchLength - 1 if d.tokens.n == maxFlateBlockTokens { // The block includes the current character - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens.n = 0 + d.tokens.Reset() } } else { // Reset, if we got a match this run. - if d.length >= minMatchLength { - d.ii = 0 + if s.length >= minMatchLength { + s.ii = 0 } // We have a byte waiting. Emit it. if d.byteAvailable { - d.ii++ - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ + s.ii++ + d.tokens.AddLiteral(d.window[s.index-1]) if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens.n = 0 + d.tokens.Reset() } - d.index++ + s.index++ // If we have a long run of no matches, skip additional bytes - // Resets when d.ii overflows after 64KB. - if d.ii > 31 { - n := int(d.ii >> 6) + // Resets when s.ii overflows after 64KB. + if s.ii > 31 { + n := int(s.ii >> 5) for j := 0; j < n; j++ { - if d.index >= d.windowEnd-1 { + if s.index >= d.windowEnd-1 { break } - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ + d.tokens.AddLiteral(d.window[s.index-1]) if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens.n = 0 + d.tokens.Reset() } - d.index++ + s.index++ } // Flush last byte - d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1])) - d.tokens.n++ + d.tokens.AddLiteral(d.window[s.index-1]) d.byteAvailable = false - // d.length = minMatchLength - 1 // not needed, since d.ii is reset above, so it should never be > minMatchLength + // s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength if d.tokens.n == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens.n = 0 + d.tokens.Reset() } } } else { - d.index++ + s.index++ d.byteAvailable = true } } @@ -1070,17 +555,17 @@ func (d *compressor) storeHuff() { if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 { return } - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) d.err = d.w.err d.windowEnd = 0 } -// storeHuff will compress and store the currently added data, +// storeFast will compress and store the currently added data, // if enough has been accumulated or we at the end of the stream. // Any error that occurred will be in d.err -func (d *compressor) storeSnappy() { +func (d *compressor) storeFast() { // We only compress if we have maxStoreBlockSize. - if d.windowEnd < maxStoreBlockSize { + if d.windowEnd < len(d.window) { if !d.sync { return } @@ -1091,32 +576,30 @@ func (d *compressor) storeSnappy() { } if d.windowEnd <= 32 { d.err = d.writeStoredBlock(d.window[:d.windowEnd]) - d.tokens.n = 0 - d.windowEnd = 0 } else { - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) + d.w.writeBlockHuff(false, d.window[:d.windowEnd], true) d.err = d.w.err } - d.tokens.n = 0 + d.tokens.Reset() d.windowEnd = 0 - d.snap.Reset() + d.fast.Reset() return } } - d.snap.Encode(&d.tokens, d.window[:d.windowEnd]) + d.fast.Encode(&d.tokens, d.window[:d.windowEnd]) // If we made zero matches, store the block as is. - if int(d.tokens.n) == d.windowEnd { + if d.tokens.n == 0 { d.err = d.writeStoredBlock(d.window[:d.windowEnd]) // If we removed less than 1/16th, huffman compress the block. } else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) { - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) d.err = d.w.err } else { - d.w.writeBlockDynamic(d.tokens.tokens[:d.tokens.n], false, d.window[:d.windowEnd]) + d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync) d.err = d.w.err } - d.tokens.n = 0 + d.tokens.Reset() d.windowEnd = 0 } @@ -1161,35 +644,26 @@ func (d *compressor) init(w io.Writer, level int) (err error) { d.fill = (*compressor).fillBlock d.step = (*compressor).store case level == ConstantCompression: + d.w.logReusePenalty = uint(4) d.window = make([]byte, maxStoreBlockSize) d.fill = (*compressor).fillBlock d.step = (*compressor).storeHuff - case level >= 1 && level <= 4: - d.snap = newSnappy(level) - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillBlock - d.step = (*compressor).storeSnappy case level == DefaultCompression: level = 5 fallthrough - case 5 <= level && level <= 9: + case level >= 1 && level <= 6: + d.w.logReusePenalty = uint(level + 1) + d.fast = newFastEnc(level) + d.window = make([]byte, maxStoreBlockSize) + d.fill = (*compressor).fillBlock + d.step = (*compressor).storeFast + case 7 <= level && level <= 9: + d.w.logReusePenalty = uint(level) + d.state = &advancedState{} d.compressionLevel = levels[level] d.initDeflate() d.fill = (*compressor).fillDeflate - if d.fastSkipHashing == skipNever { - if useSSE42 { - d.step = (*compressor).deflateLazySSE - } else { - d.step = (*compressor).deflateLazy - } - } else { - if useSSE42 { - d.step = (*compressor).deflateSSE - } else { - d.step = (*compressor).deflate - - } - } + d.step = (*compressor).deflateLazy default: return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level) } @@ -1202,10 +676,10 @@ func (d *compressor) reset(w io.Writer) { d.sync = false d.err = nil // We only need to reset a few things for Snappy. - if d.snap != nil { - d.snap.Reset() + if d.fast != nil { + d.fast.Reset() d.windowEnd = 0 - d.tokens.n = 0 + d.tokens.Reset() return } switch d.compressionLevel.chain { @@ -1213,22 +687,23 @@ func (d *compressor) reset(w io.Writer) { // level was NoCompression or ConstantCompresssion. d.windowEnd = 0 default: - d.chainHead = -1 - for i := range d.hashHead { - d.hashHead[i] = 0 + s := d.state + s.chainHead = -1 + for i := range s.hashHead { + s.hashHead[i] = 0 } - for i := range d.hashPrev { - d.hashPrev[i] = 0 + for i := range s.hashPrev { + s.hashPrev[i] = 0 } - d.hashOffset = 1 - d.index, d.windowEnd = 0, 0 + s.hashOffset = 1 + s.index, d.windowEnd = 0, 0 d.blockStart, d.byteAvailable = 0, false - d.tokens.n = 0 - d.length = minMatchLength - 1 - d.offset = 0 - d.hash = 0 - d.ii = 0 - d.maxInsertIndex = 0 + d.tokens.Reset() + s.length = minMatchLength - 1 + s.offset = 0 + s.hash = 0 + s.ii = 0 + s.maxInsertIndex = 0 } } diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go new file mode 100644 index 0000000000..b0a470f92e --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go @@ -0,0 +1,257 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Modified for deflate by Klaus Post (c) 2015. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +import ( + "fmt" + "math/bits" +) + +type fastEnc interface { + Encode(dst *tokens, src []byte) + Reset() +} + +func newFastEnc(level int) fastEnc { + switch level { + case 1: + return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}} + case 2: + return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}} + case 3: + return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}} + case 4: + return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}} + case 5: + return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}} + case 6: + return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}} + default: + panic("invalid level specified") + } +} + +const ( + tableBits = 16 // Bits used in the table + tableSize = 1 << tableBits // Size of the table + tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32. + baseMatchOffset = 1 // The smallest match offset + baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 + maxMatchOffset = 1 << 15 // The largest match offset + + bTableBits = 18 // Bits used in the big tables + bTableSize = 1 << bTableBits // Size of the table + allocHistory = maxMatchOffset * 10 // Size to preallocate for history. + bufferReset = (1 << 31) - allocHistory - maxStoreBlockSize // Reset the buffer offset when reaching this. +) + +const ( + prime3bytes = 506832829 + prime4bytes = 2654435761 + prime5bytes = 889523592379 + prime6bytes = 227718039650203 + prime7bytes = 58295818150454627 + prime8bytes = 0xcf1bbcdcb7a56463 +) + +func load32(b []byte, i int) uint32 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func load64(b []byte, i int) uint64 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:8] + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func load3232(b []byte, i int32) uint32 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func load6432(b []byte, i int32) uint64 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:8] + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func hash(u uint32) uint32 { + return (u * 0x1e35a7bd) >> tableShift +} + +type tableEntry struct { + val uint32 + offset int32 +} + +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastGen struct { + hist []byte + cur int32 +} + +func (e *fastGen) addBlock(src []byte) int32 { + // check if we have space already + if len(e.hist)+len(src) > cap(e.hist) { + if cap(e.hist) == 0 { + e.hist = make([]byte, 0, allocHistory) + } else { + if cap(e.hist) < maxMatchOffset*2 { + panic("unexpected buffer size") + } + // Move down + offset := int32(len(e.hist)) - maxMatchOffset + copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + e.cur += offset + e.hist = e.hist[:maxMatchOffset] + } + } + s := int32(len(e.hist)) + e.hist = append(e.hist, src...) + return s +} + +// hash4 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4u(u uint32, h uint8) uint32 { + return (u * prime4bytes) >> ((32 - h) & 31) +} + +type tableEntryPrev struct { + Cur tableEntry + Prev tableEntry +} + +// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4x64(u uint64, h uint8) uint32 { + return (uint32(u) * prime4bytes) >> ((32 - h) & 31) +} + +// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash7(u uint64, h uint8) uint32 { + return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63)) +} + +// hash8 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash8(u uint64, h uint8) uint32 { + return uint32((u * prime8bytes) >> ((64 - h) & 63)) +} + +// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash6(u uint64, h uint8) uint32 { + return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63)) +} + +// matchlen will return the match length between offsets and t in src. +// The maximum length returned is maxMatchLength - 4. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchlen(s, t int32, src []byte) int32 { + if debugDecode { + if t >= s { + panic(fmt.Sprint("t >=s:", t, s)) + } + if int(s) >= len(src) { + panic(fmt.Sprint("s >= len(src):", s, len(src))) + } + if t < 0 { + panic(fmt.Sprint("t < 0:", t)) + } + if s-t > maxMatchOffset { + panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) + } + } + s1 := int(s) + maxMatchLength - 4 + if s1 > len(src) { + s1 = len(src) + } + + // Extend the match to be as long as possible. + return int32(matchLen(src[s:s1], src[t:])) +} + +// matchlenLong will return the match length between offsets and t in src. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 { + if debugDecode { + if t >= s { + panic(fmt.Sprint("t >=s:", t, s)) + } + if int(s) >= len(src) { + panic(fmt.Sprint("s >= len(src):", s, len(src))) + } + if t < 0 { + panic(fmt.Sprint("t < 0:", t)) + } + if s-t > maxMatchOffset { + panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) + } + } + // Extend the match to be as long as possible. + return int32(matchLen(src[s:], src[t:])) +} + +// Reset the encoding table. +func (e *fastGen) Reset() { + if cap(e.hist) < int(maxMatchOffset*8) { + l := maxMatchOffset * 8 + // Make it at least 1MB. + if l < 1<<20 { + l = 1 << 20 + } + e.hist = make([]byte, 0, l) + } + // We offset current position so everything will be out of reach + e.cur += maxMatchOffset + int32(len(e.hist)) + e.hist = e.hist[:0] +} + +// matchLen returns the maximum length. +// 'a' must be the shortest of the two. +func matchLen(a, b []byte) int { + b = b[:len(a)] + var checked int + if len(a) > 4 { + // Try 4 bytes first + if diff := load32(a, 0) ^ load32(b, 0); diff != 0 { + return bits.TrailingZeros32(diff) >> 3 + } + // Switch to 8 byte matching. + checked = 4 + a = a[4:] + b = b[4:] + for len(a) >= 8 { + b = b[:len(a)] + if diff := load64(a, 0) ^ load64(b, 0); diff != 0 { + return checked + (bits.TrailingZeros64(diff) >> 3) + } + checked += 8 + a = a[8:] + b = b[8:] + } + } + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + return int(i) + checked + } + } + return len(a) + checked +} diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go index f9b2a699a3..5ed476aa0d 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go @@ -35,7 +35,7 @@ const ( ) // The number of extra bits needed by length code X - LENGTH_CODES_START. -var lengthExtraBits = []int8{ +var lengthExtraBits = [32]int8{ /* 257 */ 0, 0, 0, /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, @@ -43,14 +43,14 @@ var lengthExtraBits = []int8{ } // The length indicated by length code X - LENGTH_CODES_START. -var lengthBase = []uint32{ +var lengthBase = [32]uint8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 255, } // offset code word extra bits. -var offsetExtraBits = []int8{ +var offsetExtraBits = [64]int8{ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, @@ -58,7 +58,7 @@ var offsetExtraBits = []int8{ 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, } -var offsetBase = []uint32{ +var offsetBase = [64]uint32{ /* normal deflate */ 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, @@ -85,26 +85,48 @@ type huffmanBitWriter struct { // Data waiting to be written is bytes[0:nbytes] // and then the low nbits of bits. bits uint64 - nbits uint - bytes [bufferSize]byte - codegenFreq [codegenCodeCount]int32 - nbytes int - literalFreq []int32 - offsetFreq []int32 - codegen []uint8 + nbits uint16 + nbytes uint8 literalEncoding *huffmanEncoder offsetEncoding *huffmanEncoder codegenEncoding *huffmanEncoder err error + lastHeader int + // Set between 0 (reused block can be up to 2x the size) + logReusePenalty uint + lastHuffMan bool + bytes [256]byte + literalFreq [lengthCodesStart + 32]uint16 + offsetFreq [32]uint16 + codegenFreq [codegenCodeCount]uint16 + + // codegen must have an extra space for the final symbol. + codegen [literalCount + offsetCodeCount + 1]uint8 } +// Huffman reuse. +// +// The huffmanBitWriter supports reusing huffman tables and thereby combining block sections. +// +// This is controlled by several variables: +// +// If lastHeader is non-zero the Huffman table can be reused. +// This also indicates that a Huffman table has been generated that can output all +// possible symbols. +// It also indicates that an EOB has not yet been emitted, so if a new tabel is generated +// an EOB with the previous table must be written. +// +// If lastHuffMan is set, a table for outputting literals has been generated and offsets are invalid. +// +// An incoming block estimates the output size of a new table using a 'fresh' by calculating the +// optimal size and adding a penalty in 'logReusePenalty'. +// A Huffman table is not optimal, which is why we add a penalty, and generating a new table +// is slower both for compression and decompression. + func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { return &huffmanBitWriter{ writer: w, - literalFreq: make([]int32, maxNumLit), - offsetFreq: make([]int32, offsetCodeCount), - codegen: make([]uint8, maxNumLit+offsetCodeCount+1), - literalEncoding: newHuffmanEncoder(maxNumLit), + literalEncoding: newHuffmanEncoder(literalCount), codegenEncoding: newHuffmanEncoder(codegenCodeCount), offsetEncoding: newHuffmanEncoder(offsetCodeCount), } @@ -113,7 +135,42 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { func (w *huffmanBitWriter) reset(writer io.Writer) { w.writer = writer w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil - w.bytes = [bufferSize]byte{} + w.bytes = [256]byte{} + w.lastHeader = 0 + w.lastHuffMan = false +} + +func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) { + offsets, lits = true, true + a := t.offHist[:offsetCodeCount] + b := w.offsetFreq[:len(a)] + for i := range a { + if b[i] == 0 && a[i] != 0 { + offsets = false + break + } + } + + a = t.extraHist[:literalCount-256] + b = w.literalFreq[256:literalCount] + b = b[:len(a)] + for i := range a { + if b[i] == 0 && a[i] != 0 { + lits = false + break + } + } + if lits { + a = t.litHist[:] + b = w.literalFreq[:len(a)] + for i := range a { + if b[i] == 0 && a[i] != 0 { + lits = false + break + } + } + } + return } func (w *huffmanBitWriter) flush() { @@ -144,30 +201,11 @@ func (w *huffmanBitWriter) write(b []byte) { _, w.err = w.writer.Write(b) } -func (w *huffmanBitWriter) writeBits(b int32, nb uint) { - if w.err != nil { - return - } - w.bits |= uint64(b) << w.nbits +func (w *huffmanBitWriter) writeBits(b int32, nb uint16) { + w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) - n = 0 - } - w.nbytes = n + w.writeOutBits() } } @@ -213,7 +251,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE // a copy of the frequencies, and as the place where we put the result. // This is fine because the output is always shorter than the input used // so far. - codegen := w.codegen // cache + codegen := w.codegen[:] // cache // Copy the concatenated code sizes to codegen. Put a marker at the end. cgnl := codegen[:numLiterals] for i := range cgnl { @@ -292,30 +330,54 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE codegen[outIndex] = badCode } -// dynamicSize returns the size of dynamically encoded data in bits. -func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { +func (w *huffmanBitWriter) codegens() int { + numCodegens := len(w.codegenFreq) + for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { + numCodegens-- + } + return numCodegens +} + +func (w *huffmanBitWriter) headerSize() (size, numCodegens int) { numCodegens = len(w.codegenFreq) for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { numCodegens-- } - header := 3 + 5 + 5 + 4 + (3 * numCodegens) + + return 3 + 5 + 5 + 4 + (3 * numCodegens) + w.codegenEncoding.bitLength(w.codegenFreq[:]) + int(w.codegenFreq[16])*2 + int(w.codegenFreq[17])*3 + - int(w.codegenFreq[18])*7 + int(w.codegenFreq[18])*7, numCodegens +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { + header, numCodegens := w.headerSize() size = header + - litEnc.bitLength(w.literalFreq) + - offEnc.bitLength(w.offsetFreq) + + litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + extraBits - return size, numCodegens } +// extraBitSize will return the number of bits that will be written +// as "extra" bits on matches. +func (w *huffmanBitWriter) extraBitSize() int { + total := 0 + for i, n := range w.literalFreq[257:literalCount] { + total += int(n) * int(lengthExtraBits[i&31]) + } + for i, n := range w.offsetFreq[:offsetCodeCount] { + total += int(n) * int(offsetExtraBits[i&31]) + } + return total +} + // fixedSize returns the size of dynamically encoded data in bits. func (w *huffmanBitWriter) fixedSize(extraBits int) int { return 3 + - fixedLiteralEncoding.bitLength(w.literalFreq) + - fixedOffsetEncoding.bitLength(w.offsetFreq) + + fixedLiteralEncoding.bitLength(w.literalFreq[:]) + + fixedOffsetEncoding.bitLength(w.offsetFreq[:]) + extraBits } @@ -333,30 +395,36 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { } func (w *huffmanBitWriter) writeCode(c hcode) { - if w.err != nil { - return - } + // The function does not get inlined if we "& 63" the shift. w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) + w.nbits += c.len if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) + w.writeOutBits() + } +} + +// writeOutBits will write bits to the buffer. +func (w *huffmanBitWriter) writeOutBits() { + bits := w.bits + w.bits >>= 48 + w.nbits -= 48 + n := w.nbytes + w.bytes[n] = byte(bits) + w.bytes[n+1] = byte(bits >> 8) + w.bytes[n+2] = byte(bits >> 16) + w.bytes[n+3] = byte(bits >> 24) + w.bytes[n+4] = byte(bits >> 32) + w.bytes[n+5] = byte(bits >> 40) + n += 6 + if n >= bufferFlushSize { + if w.err != nil { n = 0 + return } - w.nbytes = n + w.write(w.bytes[:n]) + n = 0 } + w.nbytes = n } // Write the header of a dynamic Huffman block to the output stream. @@ -412,6 +480,11 @@ func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } var flag int32 if isEof { flag = 1 @@ -426,6 +499,12 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + // Indicate that we are a fixed Huffman block var value int32 = 2 if isEof { @@ -439,29 +518,23 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { // is larger than the original bytes, the data will be written as a // stored block. // If the input is nil, the tokens will always be Huffman encoded. -func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) - numLiterals, numOffsets := w.indexTokens(tokens) - + tokens.AddEOB() + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + numLiterals, numOffsets := w.indexTokens(tokens, false) + w.generate(tokens) var extraBits int storedSize, storable := w.storedSize(input) if storable { - // We only bother calculating the costs of the extra bits required by - // the length of offset fields (which will be the same for both fixed - // and dynamic encoding), if we need to compare those two encodings - // against stored encoding. - for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ { - // First eight length codes have extra size = 0. - extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart]) - } - for offsetCode := 4; offsetCode < numOffsets; offsetCode++ { - // First four offset codes have extra size = 0. - extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode]) - } + extraBits = w.extraBitSize() } // Figure out smallest code. @@ -500,7 +573,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Write the tokens. - w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes) + w.writeTokens(tokens.Slice(), literalEncoding.codes, offsetEncoding.codes) } // writeBlockDynamic encodes a block using a dynamic Huffman table. @@ -508,57 +581,103 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // histogram distribution. // If input is supplied and the compression savings are below 1/16th of the // input size the block is stored. -func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []byte, sync bool) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) - numLiterals, numOffsets := w.indexTokens(tokens) + sync = sync || eof + if sync { + tokens.AddEOB() + } - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. - w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, 0) + // We cannot reuse pure huffman table. + if w.lastHuffMan && w.lastHeader > 0 { + // We will not try to reuse. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false + } + if !sync { + tokens.Fill() + } + numLiterals, numOffsets := w.indexTokens(tokens, !sync) - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) - return + var size int + // Check if we should reuse. + if w.lastHeader > 0 { + // Estimate size for using a new table + newSize := w.lastHeader + tokens.EstimatedBits() + + // The estimated size is calculated as an optimal table. + // We add a penalty to make it more realistic and re-use a bit more. + newSize += newSize >> (w.logReusePenalty & 31) + extra := w.extraBitSize() + reuseSize, _ := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extra) + + // Check if a new table is better. + if newSize < reuseSize { + // Write the EOB we owe. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + size = newSize + w.lastHeader = 0 + } else { + size = reuseSize + } + // Check if we get a reasonable size decrease. + if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + w.lastHeader = 0 + return + } } - // Write Huffman table. - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + // We want a new block/table + if w.lastHeader == 0 { + w.generate(tokens) + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + var numCodegens int + size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize()) + // Store bytes, if we don't get a reasonable improvement. + if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + w.lastHeader = 0 + return + } + // Write Huffman table. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + w.lastHeader, _ = w.headerSize() + w.lastHuffMan = false + } + + if sync { + w.lastHeader = 0 + } // Write the tokens. - w.writeTokens(tokens, w.literalEncoding.codes, w.offsetEncoding.codes) + w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes) } // indexTokens indexes a slice of tokens, and updates // literalFreq and offsetFreq, and generates literalEncoding // and offsetEncoding. // The number of literal and offset tokens is returned. -func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets int) { - for i := range w.literalFreq { - w.literalFreq[i] = 0 - } - for i := range w.offsetFreq { - w.offsetFreq[i] = 0 - } +func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) { + copy(w.literalFreq[:], t.litHist[:]) + copy(w.literalFreq[256:], t.extraHist[:]) + copy(w.offsetFreq[:], t.offHist[:offsetCodeCount]) - for _, t := range tokens { - if t < matchType { - w.literalFreq[t.literal()]++ - continue - } - length := t.length() - offset := t.offset() - w.literalFreq[lengthCodesStart+lengthCode(length)]++ - w.offsetFreq[offsetCode(offset)]++ + if t.n == 0 { + return + } + if filled { + return maxNumLit, maxNumDist } - // get the number of literals numLiterals = len(w.literalFreq) for w.literalFreq[numLiterals-1] == 0 { @@ -575,41 +694,85 @@ func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets w.offsetFreq[0] = 1 numOffsets = 1 } - w.literalEncoding.generate(w.literalFreq, 15) - w.offsetEncoding.generate(w.offsetFreq, 15) return } +func (w *huffmanBitWriter) generate(t *tokens) { + w.literalEncoding.generate(w.literalFreq[:literalCount], 15) + w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15) +} + // writeTokens writes a slice of tokens to the output. // codes for literal and offset encoding must be supplied. func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) { if w.err != nil { return } + if len(tokens) == 0 { + return + } + + // Only last token should be endBlockMarker. + var deferEOB bool + if tokens[len(tokens)-1] == endBlockMarker { + tokens = tokens[:len(tokens)-1] + deferEOB = true + } + + // Create slices up to the next power of two to avoid bounds checks. + lits := leCodes[:256] + offs := oeCodes[:32] + lengths := leCodes[lengthCodesStart:] + lengths = lengths[:32] for _, t := range tokens { if t < matchType { - w.writeCode(leCodes[t.literal()]) + w.writeCode(lits[t.literal()]) continue } + // Write the length length := t.length() lengthCode := lengthCode(length) - w.writeCode(leCodes[lengthCode+lengthCodesStart]) - extraLengthBits := uint(lengthExtraBits[lengthCode]) + if false { + w.writeCode(lengths[lengthCode&31]) + } else { + // inlined + c := lengths[lengthCode&31] + w.bits |= uint64(c.code) << (w.nbits & 63) + w.nbits += c.len + if w.nbits >= 48 { + w.writeOutBits() + } + } + + extraLengthBits := uint16(lengthExtraBits[lengthCode&31]) if extraLengthBits > 0 { - extraLength := int32(length - lengthBase[lengthCode]) + extraLength := int32(length - lengthBase[lengthCode&31]) w.writeBits(extraLength, extraLengthBits) } // Write the offset offset := t.offset() offsetCode := offsetCode(offset) - w.writeCode(oeCodes[offsetCode]) - extraOffsetBits := uint(offsetExtraBits[offsetCode]) + if false { + w.writeCode(offs[offsetCode&31]) + } else { + // inlined + c := offs[offsetCode&31] + w.bits |= uint64(c.code) << (w.nbits & 63) + w.nbits += c.len + if w.nbits >= 48 { + w.writeOutBits() + } + } + extraOffsetBits := uint16(offsetExtraBits[offsetCode&63]) if extraOffsetBits > 0 { - extraOffset := int32(offset - offsetBase[offsetCode]) + extraOffset := int32(offset - offsetBase[offsetCode&63]) w.writeBits(extraOffset, extraOffsetBits) } } + if deferEOB { + w.writeCode(leCodes[endBlockMarker]) + } } // huffOffset is a static offset encoder used for huffman only encoding. @@ -620,82 +783,99 @@ func init() { w := newHuffmanBitWriter(nil) w.offsetFreq[0] = 1 huffOffset = newHuffmanEncoder(offsetCodeCount) - huffOffset.generate(w.offsetFreq, 15) + huffOffset.generate(w.offsetFreq[:offsetCodeCount], 15) } // writeBlockHuff encodes a block of bytes as either // Huffman encoded literals or uncompressed bytes if the // results only gains very little from compression. -func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { if w.err != nil { return } // Clear histogram - for i := range w.literalFreq { + for i := range w.literalFreq[:] { w.literalFreq[i] = 0 } + if !w.lastHuffMan { + for i := range w.offsetFreq[:] { + w.offsetFreq[i] = 0 + } + } // Add everything as literals - histogram(input, w.literalFreq) + estBits := histogramSize(input, w.literalFreq[:], !eof && !sync) + 15 - w.literalFreq[endBlockMarker] = 1 + // Store bytes, if we don't get a reasonable improvement. + ssize, storable := w.storedSize(input) + if storable && ssize < (estBits+estBits>>4) { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } - const numLiterals = endBlockMarker + 1 - const numOffsets = 1 + if w.lastHeader > 0 { + size, _ := w.dynamicSize(w.literalEncoding, huffOffset, w.lastHeader) + estBits += estBits >> (w.logReusePenalty) - w.literalEncoding.generate(w.literalFreq, 15) + if estBits < size { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + } - // Figure out smallest code. - // Always use dynamic Huffman or Store - var numCodegens int + const numLiterals = endBlockMarker + 1 + const numOffsets = 1 + if w.lastHeader == 0 { + w.literalFreq[endBlockMarker] = 1 + w.literalEncoding.generate(w.literalFreq[:numLiterals], 15) - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. - w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0) + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + numCodegens := w.codegens() - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) - return + // Huffman. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + w.lastHuffMan = true + w.lastHeader, _ = w.headerSize() } - // Huffman. - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) encoding := w.literalEncoding.codes[:257] - n := w.nbytes for _, t := range input { // Bitwriting inlined, ~30% speedup c := encoding[t] - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) - if w.nbits < 48 { - continue - } - // Store 6 bytes - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n < bufferFlushSize { - continue - } - w.write(w.bytes[:n]) - if w.err != nil { - return // Return early in the event of write failures + w.bits |= uint64(c.code) << ((w.nbits) & 63) + w.nbits += c.len + if w.nbits >= 48 { + bits := w.bits + w.bits >>= 48 + w.nbits -= 48 + n := w.nbytes + w.bytes[n] = byte(bits) + w.bytes[n+1] = byte(bits >> 8) + w.bytes[n+2] = byte(bits >> 16) + w.bytes[n+3] = byte(bits >> 24) + w.bytes[n+4] = byte(bits >> 32) + w.bytes[n+5] = byte(bits >> 40) + n += 6 + if n >= bufferFlushSize { + if w.err != nil { + n = 0 + return + } + w.write(w.bytes[:n]) + n = 0 + } + w.nbytes = n } - n = 0 } - w.nbytes = n - w.writeCode(encoding[endBlockMarker]) + if eof || sync { + w.writeCode(encoding[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false + } } diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go index bdcbd823b0..d0099599c5 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_code.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go @@ -6,9 +6,16 @@ package flate import ( "math" + "math/bits" "sort" ) +const ( + maxBitsLimit = 16 + // number of valid literals + literalCount = 286 +) + // hcode is a huffman code with a bit code and bit length. type hcode struct { code, len uint16 @@ -24,7 +31,7 @@ type huffmanEncoder struct { type literalNode struct { literal uint16 - freq int32 + freq uint16 } // A levelInfo describes the state of the constructed tree for a given depth. @@ -53,18 +60,24 @@ func (h *hcode) set(code uint16, length uint16) { h.code = code } -func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxInt32} } +func reverseBits(number uint16, bitLength byte) uint16 { + return bits.Reverse16(number << ((16 - bitLength) & 15)) +} + +func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} } func newHuffmanEncoder(size int) *huffmanEncoder { - return &huffmanEncoder{codes: make([]hcode, size)} + // Make capacity to next power of two. + c := uint(bits.Len32(uint32(size - 1))) + return &huffmanEncoder{codes: make([]hcode, size, 1<<c)} } // Generates a HuffmanCode corresponding to the fixed literal table func generateFixedLiteralEncoding() *huffmanEncoder { - h := newHuffmanEncoder(maxNumLit) + h := newHuffmanEncoder(literalCount) codes := h.codes var ch uint16 - for ch = 0; ch < maxNumLit; ch++ { + for ch = 0; ch < literalCount; ch++ { var bits uint16 var size uint16 switch { @@ -105,7 +118,7 @@ func generateFixedOffsetEncoding() *huffmanEncoder { var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding() var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding() -func (h *huffmanEncoder) bitLength(freq []int32) int { +func (h *huffmanEncoder) bitLength(freq []uint16) int { var total int for i, f := range freq { if f != 0 { @@ -115,8 +128,6 @@ func (h *huffmanEncoder) bitLength(freq []int32) int { return total } -const maxBitsLimit = 16 - // Return the number of literals assigned to each bit size in the Huffman encoding // // This method is only called when list.length >= 3 @@ -160,9 +171,9 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // We initialize the levels as if we had already figured this out. levels[level] = levelInfo{ level: level, - lastFreq: list[1].freq, - nextCharFreq: list[2].freq, - nextPairFreq: list[0].freq + list[1].freq, + lastFreq: int32(list[1].freq), + nextCharFreq: int32(list[2].freq), + nextPairFreq: int32(list[0].freq) + int32(list[1].freq), } leafCounts[level][level] = 2 if level == 1 { @@ -194,7 +205,12 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { l.lastFreq = l.nextCharFreq // Lower leafCounts are the same of the previous node. leafCounts[level][level] = n - l.nextCharFreq = list[n].freq + e := list[n] + if e.literal < math.MaxUint16 { + l.nextCharFreq = int32(e.freq) + } else { + l.nextCharFreq = math.MaxInt32 + } } else { // The next item on this row is a pair from the previous row. // nextPairFreq isn't valid until we generate two @@ -270,12 +286,12 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // // freq An array of frequencies, in which frequency[i] gives the frequency of literal i. // maxBits The maximum number of bits to use for any literal. -func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { +func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { if h.freqcache == nil { // Allocate a reusable buffer with the longest possible frequency table. - // Possible lengths are codegenCodeCount, offsetCodeCount and maxNumLit. - // The largest of these is maxNumLit, so we allocate for that case. - h.freqcache = make([]literalNode, maxNumLit+1) + // Possible lengths are codegenCodeCount, offsetCodeCount and literalCount. + // The largest of these is literalCount, so we allocate for that case. + h.freqcache = make([]literalNode, literalCount+1) } list := h.freqcache[:len(freq)+1] // Number of non-zero literals @@ -342,3 +358,27 @@ func (s byFreq) Less(i, j int) bool { } func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] } + +// histogramSize accumulates a histogram of b in h. +// An estimated size in bits is returned. +// Unassigned values are assigned '1' in the histogram. +// len(h) must be >= 256, and h's elements must be all zeroes. +func histogramSize(b []byte, h []uint16, fill bool) int { + h = h[:256] + for _, t := range b { + h[t]++ + } + invTotal := 1.0 / float64(len(b)) + shannon := 0.0 + single := math.Ceil(-math.Log2(invTotal)) + for i, v := range h[:] { + if v > 0 { + n := float64(v) + shannon += math.Ceil(-math.Log2(n*invTotal) * n) + } else if fill { + shannon += single + h[i] = 1 + } + } + return int(shannon + 0.99) +} diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go index 53b63d9a0b..6dc5b5d06e 100644 --- a/vendor/github.com/klauspost/compress/flate/inflate.go +++ b/vendor/github.com/klauspost/compress/flate/inflate.go @@ -9,19 +9,24 @@ package flate import ( "bufio" + "fmt" "io" + "math/bits" "strconv" "sync" ) const ( - maxCodeLen = 16 // max length of Huffman code + maxCodeLen = 16 // max length of Huffman code + maxCodeLenMask = 15 // mask for max length of Huffman code // The next three numbers come from the RFC section 3.2.7, with the // additional proviso in section 3.2.5 which implies that distance codes // 30 and 31 should never occur in compressed data. maxNumLit = 286 maxNumDist = 30 numCodes = 19 // number of codes in Huffman meta-code + + debugDecode = false ) // Initialize the fixedHuffmanDecoder only once upon first use. @@ -101,10 +106,10 @@ const ( ) type huffmanDecoder struct { - min int // the minimum code length - chunks [huffmanNumChunks]uint32 // chunks as described above - links [][]uint32 // overflow links - linkMask uint32 // mask the width of the link table + min int // the minimum code length + chunks *[huffmanNumChunks]uint16 // chunks as described above + links [][]uint16 // overflow links + linkMask uint32 // mask the width of the link table } // Initialize Huffman decoding tables from array of code lengths. @@ -112,21 +117,24 @@ type huffmanDecoder struct { // tree (i.e., neither over-subscribed nor under-subscribed). The exception is a // degenerate case where the tree has only a single symbol with length 1. Empty // trees are permitted. -func (h *huffmanDecoder) init(bits []int) bool { +func (h *huffmanDecoder) init(lengths []int) bool { // Sanity enables additional runtime tests during Huffman // table construction. It's intended to be used during // development to supplement the currently ad-hoc unit tests. const sanity = false + if h.chunks == nil { + h.chunks = &[huffmanNumChunks]uint16{} + } if h.min != 0 { - *h = huffmanDecoder{} + *h = huffmanDecoder{chunks: h.chunks, links: h.links} } // Count number of codes of each length, // compute min and max length. var count [maxCodeLen]int var min, max int - for _, n := range bits { + for _, n := range lengths { if n == 0 { continue } @@ -136,7 +144,7 @@ func (h *huffmanDecoder) init(bits []int) bool { if n > max { max = n } - count[n]++ + count[n&maxCodeLenMask]++ } // Empty tree. The decompressor.huffSym function will fail later if the tree @@ -154,8 +162,8 @@ func (h *huffmanDecoder) init(bits []int) bool { var nextcode [maxCodeLen]int for i := min; i <= max; i++ { code <<= 1 - nextcode[i] = code - code += count[i] + nextcode[i&maxCodeLenMask] = code + code += count[i&maxCodeLenMask] } // Check that the coding is complete (i.e., that we've @@ -164,37 +172,56 @@ func (h *huffmanDecoder) init(bits []int) bool { // accept degenerate single-code codings. See also // TestDegenerateHuffmanCoding. if code != 1<<uint(max) && !(code == 1 && max == 1) { + if debugDecode { + fmt.Println("coding failed, code, max:", code, max, code == 1<<uint(max), code == 1 && max == 1, "(one should be true)") + } return false } h.min = min + chunks := h.chunks[:] + for i := range chunks { + chunks[i] = 0 + } + if max > huffmanChunkBits { numLinks := 1 << (uint(max) - huffmanChunkBits) h.linkMask = uint32(numLinks - 1) // create link tables link := nextcode[huffmanChunkBits+1] >> 1 - h.links = make([][]uint32, huffmanNumChunks-link) + if cap(h.links) < huffmanNumChunks-link { + h.links = make([][]uint16, huffmanNumChunks-link) + } else { + h.links = h.links[:huffmanNumChunks-link] + } for j := uint(link); j < huffmanNumChunks; j++ { - reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8 + reverse := int(bits.Reverse16(uint16(j))) reverse >>= uint(16 - huffmanChunkBits) off := j - uint(link) if sanity && h.chunks[reverse] != 0 { panic("impossible: overwriting existing chunk") } - h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1)) - h.links[off] = make([]uint32, numLinks) + h.chunks[reverse] = uint16(off<<huffmanValueShift | (huffmanChunkBits + 1)) + if cap(h.links[off]) < numLinks { + h.links[off] = make([]uint16, numLinks) + } else { + links := h.links[off][:0] + h.links[off] = links[:numLinks] + } } + } else { + h.links = h.links[:0] } - for i, n := range bits { + for i, n := range lengths { if n == 0 { continue } code := nextcode[n] nextcode[n]++ - chunk := uint32(i<<huffmanValueShift | n) - reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8 + chunk := uint16(i<<huffmanValueShift | n) + reverse := int(bits.Reverse16(uint16(code))) reverse >>= uint(16 - n) if n <= huffmanChunkBits { for off := reverse; off < len(h.chunks); off += 1 << uint(n) { @@ -326,6 +353,9 @@ func (f *decompressor) nextBlock() { f.huffmanBlock() default: // 3 is reserved. + if debugDecode { + fmt.Println("reserved data block encountered") + } f.err = CorruptInputError(f.roffset) } } @@ -404,11 +434,17 @@ func (f *decompressor) readHuffman() error { } nlit := int(f.b&0x1F) + 257 if nlit > maxNumLit { + if debugDecode { + fmt.Println("nlit > maxNumLit", nlit) + } return CorruptInputError(f.roffset) } f.b >>= 5 ndist := int(f.b&0x1F) + 1 if ndist > maxNumDist { + if debugDecode { + fmt.Println("ndist > maxNumDist", ndist) + } return CorruptInputError(f.roffset) } f.b >>= 5 @@ -432,6 +468,9 @@ func (f *decompressor) readHuffman() error { f.codebits[codeOrder[i]] = 0 } if !f.h1.init(f.codebits[0:]) { + if debugDecode { + fmt.Println("init codebits failed") + } return CorruptInputError(f.roffset) } @@ -459,6 +498,9 @@ func (f *decompressor) readHuffman() error { rep = 3 nb = 2 if i == 0 { + if debugDecode { + fmt.Println("i==0") + } return CorruptInputError(f.roffset) } b = f.bits[i-1] @@ -473,6 +515,9 @@ func (f *decompressor) readHuffman() error { } for f.nb < nb { if err := f.moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits:", err) + } return err } } @@ -480,6 +525,9 @@ func (f *decompressor) readHuffman() error { f.b >>= nb f.nb -= nb if i+rep > n { + if debugDecode { + fmt.Println("i+rep > n", i, rep, n) + } return CorruptInputError(f.roffset) } for j := 0; j < rep; j++ { @@ -489,6 +537,9 @@ func (f *decompressor) readHuffman() error { } if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) { + if debugDecode { + fmt.Println("init2 failed") + } return CorruptInputError(f.roffset) } @@ -566,12 +617,18 @@ readLiteral: length = 258 n = 0 default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } f.err = CorruptInputError(f.roffset) return } if n > 0 { for f.nb < n { if err = f.moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits n>0:", err) + } f.err = err return } @@ -585,15 +642,21 @@ readLiteral: if f.hd == nil { for f.nb < 5 { if err = f.moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } f.err = err return } } - dist = int(reverseByte[(f.b&0x1F)<<3]) + dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) f.b >>= 5 f.nb -= 5 } else { if dist, err = f.huffSym(f.hd); err != nil { + if debugDecode { + fmt.Println("huffsym:", err) + } f.err = err return } @@ -608,6 +671,9 @@ readLiteral: extra := (dist & 1) << nb for f.nb < nb { if err = f.moreBits(); err != nil { + if debugDecode { + fmt.Println("morebits f.nb<nb:", err) + } f.err = err return } @@ -617,12 +683,18 @@ readLiteral: f.nb -= nb dist = 1<<(nb+1) + 1 + extra default: + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } f.err = CorruptInputError(f.roffset) return } // No check on length; encoding can be prescient. if dist > f.dict.histSize() { + if debugDecode { + fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + } f.err = CorruptInputError(f.roffset) return } @@ -661,15 +733,15 @@ func (f *decompressor) dataBlock() { nr, err := io.ReadFull(f.r, f.buf[0:4]) f.roffset += int64(nr) if err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - f.err = err + f.err = noEOF(err) return } n := int(f.buf[0]) | int(f.buf[1])<<8 nn := int(f.buf[2]) | int(f.buf[3])<<8 if uint16(nn) != uint16(^n) { + if debugDecode { + fmt.Println("uint16(nn) != uint16(^n)", nn, ^n) + } f.err = CorruptInputError(f.roffset) return } @@ -697,10 +769,7 @@ func (f *decompressor) copyData() { f.copyLen -= cnt f.dict.writeMark(cnt) if err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - f.err = err + f.err = noEOF(err) return } @@ -722,13 +791,18 @@ func (f *decompressor) finishBlock() { f.step = (*decompressor).nextBlock } +// noEOF returns err, unless err == io.EOF, in which case it returns io.ErrUnexpectedEOF. +func noEOF(e error) error { + if e == io.EOF { + return io.ErrUnexpectedEOF + } + return e +} + func (f *decompressor) moreBits() error { c, err := f.r.ReadByte() if err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - return err + return noEOF(err) } f.roffset++ f.b |= uint32(c) << f.nb @@ -743,25 +817,40 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) { // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(h.min) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + nb, b := f.nb, f.b for { - for f.nb < n { - if err := f.moreBits(); err != nil { - return 0, err + for nb < n { + c, err := f.r.ReadByte() + if err != nil { + f.b = b + f.nb = nb + return 0, noEOF(err) } + f.roffset++ + b |= uint32(c) << (nb & 31) + nb += 8 } - chunk := h.chunks[f.b&(huffmanNumChunks-1)] + chunk := h.chunks[b&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = h.links[chunk>>huffmanValueShift][(f.b>>huffmanChunkBits)&h.linkMask] + chunk = h.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&h.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= f.nb { + if n <= nb { if n == 0 { + f.b = b + f.nb = nb + if debugDecode { + fmt.Println("huffsym: n==0") + } f.err = CorruptInputError(f.roffset) return 0, f.err } - f.b >>= n - f.nb -= n + f.b = b >> (n & 31) + f.nb = nb - n return int(chunk >> huffmanValueShift), nil } } @@ -799,6 +888,8 @@ func (f *decompressor) Reset(r io.Reader, dict []byte) error { r: makeReader(r), bits: f.bits, codebits: f.codebits, + h1: f.h1, + h2: f.h2, dict: f.dict, step: (*decompressor).nextBlock, } diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go new file mode 100644 index 0000000000..20de8f11f4 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level1.go @@ -0,0 +1,174 @@ +package flate + +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastEncL1 struct { + fastGen + table [tableSize]tableEntry +} + +// EncodeL1 uses a similar algorithm to level 1 +func (e *fastEncL1) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3232(src, s) + + for { + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hash(cv) + candidate = e.table[nextHash] + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + + now := load6432(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + nextHash = hash(uint32(now)) + + offset := s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == candidate.val { + e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} + break + } + + // Do one right away... + cv = uint32(now) + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + + offset = s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == candidate.val { + e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} + break + } + cv = uint32(now) + s = nextS + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + t := candidate.offset - e.cur + l := e.matchlenLong(s+4, t+4, src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + + // Save the match found + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + if s >= sLimit { + // Index first pair after match end. + if int(s+l+4) < len(src) { + cv := load3232(src, s) + e.table[hash(cv)] = tableEntry{offset: s + e.cur, val: cv} + } + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. + x := load6432(src, s-2) + o := e.cur + s - 2 + prevHash := hash(uint32(x)) + e.table[prevHash] = tableEntry{offset: o, val: uint32(x)} + x >>= 16 + currHash := hash(uint32(x)) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x)} + + offset := s - (candidate.offset - e.cur) + if offset > maxMatchOffset || uint32(x) != candidate.val { + cv = uint32(x >> 8) + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go new file mode 100644 index 0000000000..7c824431e6 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level2.go @@ -0,0 +1,199 @@ +package flate + +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastEncL2 struct { + fastGen + table [bTableSize]tableEntry +} + +// EncodeL2 uses a similar algorithm to level 1, but is capable +// of matching across blocks giving better compression at a small slowdown. +func (e *fastEncL2) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3232(src, s) + for { + // When should we start skipping if we haven't found matches in a long while. + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hash4u(cv, bTableBits) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidate = e.table[nextHash] + now := load6432(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + nextHash = hash4u(uint32(now), bTableBits) + + offset := s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == candidate.val { + e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)} + break + } + + // Do one right away... + cv = uint32(now) + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv} + + offset = s - (candidate.offset - e.cur) + if offset < maxMatchOffset && cv == candidate.val { + break + } + cv = uint32(now) + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + t := candidate.offset - e.cur + l := e.matchlenLong(s+4, t+4, src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+l+4) < len(src) { + cv := load3232(src, s) + e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur, val: cv} + } + goto emitRemainder + } + + // Store every second hash in-between, but offset by 1. + for i := s - l + 2; i < s-5; i += 7 { + x := load6432(src, int32(i)) + nextHash := hash4u(uint32(x), bTableBits) + e.table[nextHash] = tableEntry{offset: e.cur + i, val: uint32(x)} + // Skip one + x >>= 16 + nextHash = hash4u(uint32(x), bTableBits) + e.table[nextHash] = tableEntry{offset: e.cur + i + 2, val: uint32(x)} + // Skip one + x >>= 16 + nextHash = hash4u(uint32(x), bTableBits) + e.table[nextHash] = tableEntry{offset: e.cur + i + 4, val: uint32(x)} + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. + x := load6432(src, s-2) + o := e.cur + s - 2 + prevHash := hash4u(uint32(x), bTableBits) + prevHash2 := hash4u(uint32(x>>8), bTableBits) + e.table[prevHash] = tableEntry{offset: o, val: uint32(x)} + e.table[prevHash2] = tableEntry{offset: o + 1, val: uint32(x >> 8)} + currHash := hash4u(uint32(x>>16), bTableBits) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x >> 16)} + + offset := s - (candidate.offset - e.cur) + if offset > maxMatchOffset || uint32(x>>16) != candidate.val { + cv = uint32(x >> 24) + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go new file mode 100644 index 0000000000..4153d24c95 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level3.go @@ -0,0 +1,225 @@ +package flate + +// fastEncL3 +type fastEncL3 struct { + fastGen + table [tableSize]tableEntryPrev +} + +// Encode uses a similar algorithm to level 2, will check up to two candidates. +func (e *fastEncL3) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 8 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + } + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + e.table[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // Skip if too small. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3232(src, s) + for { + const skipLog = 6 + nextS := s + var candidate tableEntry + for { + nextHash := hash(cv) + s = nextS + nextS = s + 1 + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidates := e.table[nextHash] + now := load3232(src, nextS) + e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} + + // Check both candidates + candidate = candidates.Cur + offset := s - (candidate.offset - e.cur) + if cv == candidate.val { + if offset > maxMatchOffset { + cv = now + // Previous will also be invalid, we have nothing. + continue + } + o2 := s - (candidates.Prev.offset - e.cur) + if cv != candidates.Prev.val || o2 > maxMatchOffset { + break + } + // Both match and are valid, pick longest. + l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:]) + if l2 > l1 { + candidate = candidates.Prev + } + break + } else { + // We only check if value mismatches. + // Offset will always be invalid in other cases. + candidate = candidates.Prev + if cv == candidate.val { + offset := s - (candidate.offset - e.cur) + if offset <= maxMatchOffset { + break + } + } + } + cv = now + } + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + // + t := candidate.offset - e.cur + l := e.matchlenLong(s+4, t+4, src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + t += l + // Index first pair after match end. + if int(t+4) < len(src) && t > 0 { + cv := load3232(src, t) + nextHash := hash(cv) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + t, val: cv}, + } + } + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-3 to s. + x := load6432(src, s-3) + prevHash := hash(uint32(x)) + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 3, val: uint32(x)}, + } + x >>= 8 + prevHash = hash(uint32(x)) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, + } + x >>= 8 + prevHash = hash(uint32(x)) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, + } + x >>= 8 + currHash := hash(uint32(x)) + candidates := e.table[currHash] + cv = uint32(x) + e.table[currHash] = tableEntryPrev{ + Prev: candidates.Cur, + Cur: tableEntry{offset: s + e.cur, val: cv}, + } + + // Check both candidates + candidate = candidates.Cur + if cv == candidate.val { + offset := s - (candidate.offset - e.cur) + if offset <= maxMatchOffset { + continue + } + } else { + // We only check if value mismatches. + // Offset will always be invalid in other cases. + candidate = candidates.Prev + if cv == candidate.val { + offset := s - (candidate.offset - e.cur) + if offset <= maxMatchOffset { + continue + } + } + } + cv = uint32(x >> 8) + s++ + break + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go new file mode 100644 index 0000000000..c689ac771b --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level4.go @@ -0,0 +1,210 @@ +package flate + +import "fmt" + +type fastEncL4 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntry +} + +func (e *fastEncL4) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.bTable[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load6432(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var t int32 + for { + nextHashS := hash4x64(cv, tableBits) + nextHashL := hash7(cv, tableBits) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := load6432(src, nextS) + entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + e.table[nextHashS] = entry + e.bTable[nextHashL] = entry + + t = lCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == lCandidate.val { + // We got a long match. Use that. + break + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { + // Found a 4 match... + lCandidate = e.bTable[hash7(next, tableBits)] + + // If the next long is a candidate, check if we should use that instead... + lOff := nextS - (lCandidate.offset - e.cur) + if lOff < maxMatchOffset && lCandidate.val == uint32(next) { + l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:]) + if l2 > l1 { + s = nextS + t = lCandidate.offset - e.cur + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + l := e.matchlenLong(s+4, t+4, src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + if false { + if t >= s { + panic("s-t") + } + if (s - t) > maxMatchOffset { + panic(fmt.Sprintln("mmo", t)) + } + if l < baseMatchLength { + panic("bml") + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+8) < len(src) { + cv := load6432(src, s) + e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)} + e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)} + } + goto emitRemainder + } + + // Store every 3rd hash in-between + if true { + i := nextS + if i < s-1 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur, val: uint32(cv)} + t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1} + e.bTable[hash7(cv, tableBits)] = t + e.bTable[hash7(cv>>8, tableBits)] = t2 + e.table[hash4u(t2.val, tableBits)] = t2 + + i += 3 + for ; i < s-1; i += 3 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur, val: uint32(cv)} + t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1} + e.bTable[hash7(cv, tableBits)] = t + e.bTable[hash7(cv>>8, tableBits)] = t2 + e.table[hash4u(t2.val, tableBits)] = t2 + } + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := load6432(src, s-1) + o := e.cur + s - 1 + prevHashS := hash4x64(x, tableBits) + prevHashL := hash7(x, tableBits) + e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)} + e.bTable[prevHashL] = tableEntry{offset: o, val: uint32(x)} + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go new file mode 100644 index 0000000000..14a2356126 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level5.go @@ -0,0 +1,276 @@ +package flate + +import "fmt" + +type fastEncL5 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL5) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load6432(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hash4x64(cv, tableBits) + nextHashL := hash7(cv, tableBits) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := load6432(src, nextS) + entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + nextHashS = hash4x64(next, tableBits) + nextHashL = hash7(next, tableBits) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == lCandidate.Cur.val { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + l = e.matchlen(s+4, t+4, src) + 4 + ml1 := e.matchlen(s+4, t2+4, src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { + // Found a 4 match... + l = e.matchlen(s+4, t+4, src) + 4 + lCandidate = e.bTable[nextHashL] + // Store the next match + + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + + // If the next long is a candidate, use that... + t2 := lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if lCandidate.Cur.val == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + if l == 0 { + l = e.matchlenLong(s+4, t+4, src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(s+l, t+l, src) + } + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + if false { + if t >= s { + panic(fmt.Sprintln("s-t", s, t)) + } + if (s - t) > maxMatchOffset { + panic(fmt.Sprintln("mmo", s-t)) + } + if l < baseMatchLength { + panic("bml") + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + goto emitRemainder + } + + // Store every 3rd hash in-between. + if true { + const hashEvery = 3 + i := s - l + 1 + if i < s-1 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur, val: uint32(cv)} + e.table[hash4x64(cv, tableBits)] = t + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // Do an long at i+1 + cv >>= 8 + t = tableEntry{offset: t.offset + 1, val: uint32(cv)} + eLong = &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // We only have enough bits for a short entry at i+2 + cv >>= 8 + t = tableEntry{offset: t.offset + 1, val: uint32(cv)} + e.table[hash4x64(cv, tableBits)] = t + + // Skip one - otherwise we risk hitting 's' + i += 4 + for ; i < s-1; i += hashEvery { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur, val: uint32(cv)} + t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + e.table[hash4u(t2.val, tableBits)] = t2 + } + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := load6432(src, s-1) + o := e.cur + s - 1 + prevHashS := hash4x64(x, tableBits) + prevHashL := hash7(x, tableBits) + e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)} + eLong := &e.bTable[prevHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: o, val: uint32(x)}, eLong.Cur + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go new file mode 100644 index 0000000000..cad0c7df7f --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/level6.go @@ -0,0 +1,279 @@ +package flate + +import "fmt" + +type fastEncL6 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL6) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load6432(src, s) + // Repeat MUST be > 1 and within range + repeat := int32(1) + for { + const skipLog = 7 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hash4x64(cv, tableBits) + nextHashL := hash7(cv, tableBits) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := load6432(src, nextS) + entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + // Calculate hashes of 'next' + nextHashS = hash4x64(next, tableBits) + nextHashL = hash7(next, tableBits) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == lCandidate.Cur.val { + // Long candidate matches at least 4 bytes. + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + + // Check the previous long candidate as well. + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + l = e.matchlen(s+4, t+4, src) + 4 + ml1 := e.matchlen(s+4, t2+4, src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + // Current value did not match, but check if previous long value does. + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == sCandidate.val { + // Found a 4 match... + l = e.matchlen(s+4, t+4, src) + 4 + + // Look up next long candidate (at nextS) + lCandidate = e.bTable[nextHashL] + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur + + // Check repeat at s + repOff + const repOff = 1 + t2 := s - repeat + repOff + if load3232(src, t2) == uint32(cv>>(8*repOff)) { + ml := e.matchlen(s+4+repOff, t2+4, src) + 4 + if ml > l { + t = t2 + l = ml + s += repOff + // Not worth checking more. + break + } + } + + // If the next long is a candidate, use that... + t2 = lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if lCandidate.Cur.val == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + // This is ok, but check previous as well. + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + if l == 0 { + l = e.matchlenLong(s+4, t+4, src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(s+l, t+l, src) + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + if false { + if t >= s { + panic(fmt.Sprintln("s-t", s, t)) + } + if (s - t) > maxMatchOffset { + panic(fmt.Sprintln("mmo", s-t)) + } + if l < baseMatchLength { + panic("bml") + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + repeat = s - t + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index after match end. + for i := nextS + 1; i < int32(len(src))-8; i += 2 { + cv := load6432(src, i) + e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur, val: uint32(cv)} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur, val: uint32(cv)}, eLong.Cur + } + goto emitRemainder + } + + // Store every long hash in-between and every second short. + if true { + for i := nextS + 1; i < s-1; i += 2 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur, val: uint32(cv)} + t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong2 := &e.bTable[hash7(cv>>8, tableBits)] + e.table[hash4x64(cv, tableBits)] = t + eLong.Cur, eLong.Prev = t, eLong.Cur + eLong2.Cur, eLong2.Prev = t2, eLong2.Cur + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + cv = load6432(src, s) + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/reverse_bits.go b/vendor/github.com/klauspost/compress/flate/reverse_bits.go deleted file mode 100644 index c1a02720d1..0000000000 --- a/vendor/github.com/klauspost/compress/flate/reverse_bits.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -var reverseByte = [256]byte{ - 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, - 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, - 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, - 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, - 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, - 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, - 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, - 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, - 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, - 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, - 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, - 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, - 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, - 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, - 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, - 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, - 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, - 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, - 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, - 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, - 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, - 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, - 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, - 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, - 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, - 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, - 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, - 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, - 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, - 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, - 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, - 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, -} - -func reverseUint16(v uint16) uint16 { - return uint16(reverseByte[v>>8]) | uint16(reverseByte[v&0xFF])<<8 -} - -func reverseBits(number uint16, bitLength byte) uint16 { - return reverseUint16(number << uint8(16-bitLength)) -} diff --git a/vendor/github.com/klauspost/compress/flate/snappy.go b/vendor/github.com/klauspost/compress/flate/snappy.go deleted file mode 100644 index 0bbd946c01..0000000000 --- a/vendor/github.com/klauspost/compress/flate/snappy.go +++ /dev/null @@ -1,856 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Modified for deflate by Klaus Post (c) 2015. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package flate - -// emitLiteral writes a literal chunk and returns the number of bytes written. -func emitLiteral(dst *tokens, lit []byte) { - ol := int(dst.n) - for i, v := range lit { - dst.tokens[(i+ol)&maxStoreBlockSize] = token(v) - } - dst.n += uint16(len(lit)) -} - -// emitCopy writes a copy chunk and returns the number of bytes written. -func emitCopy(dst *tokens, offset, length int) { - dst.tokens[dst.n] = matchToken(uint32(length-3), uint32(offset-minOffsetSize)) - dst.n++ -} - -type snappyEnc interface { - Encode(dst *tokens, src []byte) - Reset() -} - -func newSnappy(level int) snappyEnc { - switch level { - case 1: - return &snappyL1{} - case 2: - return &snappyL2{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}} - case 3: - return &snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}} - case 4: - return &snappyL4{snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}} - default: - panic("invalid level specified") - } -} - -const ( - tableBits = 14 // Bits used in the table - tableSize = 1 << tableBits // Size of the table - tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks. - tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32. - baseMatchOffset = 1 // The smallest match offset - baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 - maxMatchOffset = 1 << 15 // The largest match offset -) - -func load32(b []byte, i int) uint32 { - b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func load64(b []byte, i int) uint64 { - b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 -} - -func hash(u uint32) uint32 { - return (u * 0x1e35a7bd) >> tableShift -} - -// snappyL1 encapsulates level 1 compression -type snappyL1 struct{} - -func (e *snappyL1) Reset() {} - -func (e *snappyL1) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 16 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin - ) - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - return - } - - // Initialize the hash table. - // - // The table element type is uint16, as s < sLimit and sLimit < len(src) - // and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535. - var table [tableSize]uint16 - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := 0 - - // The encoded form must start with a literal, as there are no previous - // bytes to copy, so we start looking for hash matches at s == 1. - s := 1 - nextHash := hash(load32(src, s)) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := 32 - - nextS := s - candidate := 0 - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidate = int(table[nextHash&tableMask]) - table[nextHash&tableMask] = uint16(s) - nextHash = hash(load32(src, nextS)) - // TODO: < should be <=, and add a test for that. - if s-candidate < maxMatchOffset && load32(src, s) == load32(src, candidate) { - break - } - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - base := s - - // Extend the 4-byte match as long as possible. - // - // This is an inlined version of Snappy's: - // s = extendMatch(src, candidate+4, s+4) - s += 4 - s1 := base + maxMatchLength - if s1 > len(src) { - s1 = len(src) - } - a := src[s:s1] - b := src[candidate+4:] - b = b[:len(a)] - l := len(a) - for i := range a { - if a[i] != b[i] { - l = i - break - } - } - s += l - - // matchToken is flate's equivalent of Snappy's emitCopy. - dst.tokens[dst.n] = matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset)) - dst.n++ - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load64(src, s-1) - prevHash := hash(uint32(x >> 0)) - table[prevHash&tableMask] = uint16(s - 1) - currHash := hash(uint32(x >> 8)) - candidate = int(table[currHash&tableMask]) - table[currHash&tableMask] = uint16(s) - // TODO: >= should be >, and add a test for that. - if s-candidate >= maxMatchOffset || uint32(x>>8) != load32(src, candidate) { - nextHash = hash(uint32(x >> 16)) - s++ - break - } - } - } - -emitRemainder: - if nextEmit < len(src) { - emitLiteral(dst, src[nextEmit:]) - } -} - -type tableEntry struct { - val uint32 - offset int32 -} - -func load3232(b []byte, i int32) uint32 { - b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func load6432(b []byte, i int32) uint64 { - b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 -} - -// snappyGen maintains the table for matches, -// and the previous byte block for level 2. -// This is the generic implementation. -type snappyGen struct { - prev []byte - cur int32 -} - -// snappyGen maintains the table for matches, -// and the previous byte block for level 2. -// This is the generic implementation. -type snappyL2 struct { - snappyGen - table [tableSize]tableEntry -} - -// EncodeL2 uses a similar algorithm to level 1, but is capable -// of matching across blocks giving better compression at a small slowdown. -func (e *snappyL2) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 16 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin - ) - - // Ensure that e.cur doesn't wrap, mainly an issue on 32 bits. - if e.cur > 1<<30 { - for i := range e.table { - e.table[i] = tableEntry{} - } - e.cur = maxStoreBlockSize - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load3232(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidate = e.table[nextHash&tableMask] - now := load3232(src, nextS) - e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv} - nextHash = hash(now) - - offset := s - (candidate.offset - e.cur) - if offset >= maxMatchOffset || cv != candidate.val { - // Out of range or not matched. - cv = now - continue - } - break - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchlen(s, t, src) - - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) - dst.n++ - s += l - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load6432(src, s-1) - prevHash := hash(uint32(x)) - e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)} - x >>= 8 - currHash := hash(uint32(x)) - candidate = e.table[currHash&tableMask] - e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)} - - offset := s - (candidate.offset - e.cur) - if offset >= maxMatchOffset || uint32(x) != candidate.val { - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break - } - } - } - -emitRemainder: - if int(nextEmit) < len(src) { - emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) -} - -type tableEntryPrev struct { - Cur tableEntry - Prev tableEntry -} - -// snappyL3 -type snappyL3 struct { - snappyGen - table [tableSize]tableEntryPrev -} - -// Encode uses a similar algorithm to level 2, will check up to two candidates. -func (e *snappyL3) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 16 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin - ) - - // Ensure that e.cur doesn't wrap, mainly an issue on 32 bits. - if e.cur > 1<<30 { - for i := range e.table { - e.table[i] = tableEntryPrev{} - } - e.cur = maxStoreBlockSize - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load3232(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidates := e.table[nextHash&tableMask] - now := load3232(src, nextS) - e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} - nextHash = hash(now) - - // Check both candidates - candidate = candidates.Cur - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - break - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - break - } - } - } - cv = now - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchlen(s, t, src) - - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) - dst.n++ - s += l - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-2, s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load6432(src, s-2) - prevHash := hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, - } - x >>= 8 - prevHash = hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, - } - x >>= 8 - currHash := hash(uint32(x)) - candidates := e.table[currHash&tableMask] - cv = uint32(x) - e.table[currHash&tableMask] = tableEntryPrev{ - Prev: candidates.Cur, - Cur: tableEntry{offset: s + e.cur, val: cv}, - } - - // Check both candidates - candidate = candidates.Cur - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - continue - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - continue - } - } - } - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break - } - } - -emitRemainder: - if int(nextEmit) < len(src) { - emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) -} - -// snappyL4 -type snappyL4 struct { - snappyL3 -} - -// Encode uses a similar algorithm to level 3, -// but will check up to two candidates if first isn't long enough. -func (e *snappyL4) Encode(dst *tokens, src []byte) { - const ( - inputMargin = 16 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin - matchLenGood = 12 - ) - - // Ensure that e.cur doesn't wrap, mainly an issue on 32 bits. - if e.cur > 1<<30 { - for i := range e.table { - e.table[i] = tableEntryPrev{} - } - e.cur = maxStoreBlockSize - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - // We do not fill the token table. - // This will be picked up by caller. - dst.n = uint16(len(src)) - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load3232(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - var candidateAlt tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidates := e.table[nextHash&tableMask] - now := load3232(src, nextS) - e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}} - nextHash = hash(now) - - // Check both candidates - candidate = candidates.Cur - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - offset = s - (candidates.Prev.offset - e.cur) - if cv == candidates.Prev.val && offset < maxMatchOffset { - candidateAlt = candidates.Prev - } - break - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - break - } - } - } - cv = now - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchlen(s, t, src) - // Try alternative candidate if match length < matchLenGood. - if l < matchLenGood-4 && candidateAlt.offset != 0 { - t2 := candidateAlt.offset - e.cur + 4 - l2 := e.matchlen(s, t2, src) - if l2 > l { - l = l2 - t = t2 - } - } - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)) - dst.n++ - s += l - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-2, s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load6432(src, s-2) - prevHash := hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 2, val: uint32(x)}, - } - x >>= 8 - prevHash = hash(uint32(x)) - - e.table[prevHash&tableMask] = tableEntryPrev{ - Prev: e.table[prevHash&tableMask].Cur, - Cur: tableEntry{offset: e.cur + s - 1, val: uint32(x)}, - } - x >>= 8 - currHash := hash(uint32(x)) - candidates := e.table[currHash&tableMask] - cv = uint32(x) - e.table[currHash&tableMask] = tableEntryPrev{ - Prev: candidates.Cur, - Cur: tableEntry{offset: s + e.cur, val: cv}, - } - - // Check both candidates - candidate = candidates.Cur - candidateAlt = tableEntry{} - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - offset = s - (candidates.Prev.offset - e.cur) - if cv == candidates.Prev.val && offset < maxMatchOffset { - candidateAlt = candidates.Prev - } - continue - } - } else { - // We only check if value mismatches. - // Offset will always be invalid in other cases. - candidate = candidates.Prev - if cv == candidate.val { - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset { - continue - } - } - } - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break - } - } - -emitRemainder: - if int(nextEmit) < len(src) { - emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) -} - -func (e *snappyGen) matchlen(s, t int32, src []byte) int32 { - s1 := int(s) + maxMatchLength - 4 - if s1 > len(src) { - s1 = len(src) - } - - // If we are inside the current block - if t >= 0 { - b := src[t:] - a := src[s:s1] - b = b[:len(a)] - // Extend the match to be as long as possible. - for i := range a { - if a[i] != b[i] { - return int32(i) - } - } - return int32(len(a)) - } - - // We found a match in the previous block. - tp := int32(len(e.prev)) + t - if tp < 0 { - return 0 - } - - // Extend the match to be as long as possible. - a := src[s:s1] - b := e.prev[tp:] - if len(b) > len(a) { - b = b[:len(a)] - } - a = a[:len(b)] - for i := range b { - if a[i] != b[i] { - return int32(i) - } - } - n := int32(len(b)) - a = src[s+n : s1] - b = src[:len(a)] - for i := range a { - if a[i] != b[i] { - return int32(i) + n - } - } - return int32(len(a)) + n -} - -// Reset the encoding table. -func (e *snappyGen) Reset() { - e.prev = e.prev[:0] - e.cur += maxMatchOffset + 1 -} diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go new file mode 100644 index 0000000000..524ee0ae37 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/stateless.go @@ -0,0 +1,252 @@ +package flate + +import ( + "io" + "math" +) + +const ( + maxStatelessBlock = math.MaxInt16 + + slTableBits = 13 + slTableSize = 1 << slTableBits + slTableShift = 32 - slTableBits +) + +type statelessWriter struct { + dst io.Writer + closed bool +} + +func (s *statelessWriter) Close() error { + if s.closed { + return nil + } + s.closed = true + // Emit EOF block + return StatelessDeflate(s.dst, nil, true) +} + +func (s *statelessWriter) Write(p []byte) (n int, err error) { + err = StatelessDeflate(s.dst, p, false) + if err != nil { + return 0, err + } + return len(p), nil +} + +func (s *statelessWriter) Reset(w io.Writer) { + s.dst = w + s.closed = false +} + +// NewStatelessWriter will do compression but without maintaining any state +// between Write calls. +// There will be no memory kept between Write calls, +// but compression and speed will be suboptimal. +// Because of this, the size of actual Write calls will affect output size. +func NewStatelessWriter(dst io.Writer) io.WriteCloser { + return &statelessWriter{dst: dst} +} + +// StatelessDeflate allows to compress directly to a Writer without retaining state. +// When returning everything will be flushed. +func StatelessDeflate(out io.Writer, in []byte, eof bool) error { + var dst tokens + bw := newHuffmanBitWriter(out) + if eof && len(in) == 0 { + // Just write an EOF block. + // Could be faster... + bw.writeStoredHeader(0, true) + bw.flush() + return bw.err + } + + for len(in) > 0 { + todo := in + if len(todo) > maxStatelessBlock { + todo = todo[:maxStatelessBlock] + } + in = in[len(todo):] + // Compress + statelessEnc(&dst, todo) + isEof := eof && len(in) == 0 + + if dst.n == 0 { + bw.writeStoredHeader(len(todo), isEof) + if bw.err != nil { + return bw.err + } + bw.writeBytes(todo) + } else if int(dst.n) > len(todo)-len(todo)>>4 { + // If we removed less than 1/16th, huffman compress the block. + bw.writeBlockHuff(isEof, todo, false) + } else { + bw.writeBlockDynamic(&dst, isEof, todo, false) + } + if bw.err != nil { + return bw.err + } + dst.Reset() + } + if !eof { + // Align. + bw.writeStoredHeader(0, false) + } + bw.flush() + return bw.err +} + +func hashSL(u uint32) uint32 { + return (u * 0x1e35a7bd) >> slTableShift +} + +func load3216(b []byte, i int16) uint32 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func load6416(b []byte, i int16) uint64 { + // Help the compiler eliminate bounds checks on the read so it can be done in a single read. + b = b[i:] + b = b[:8] + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func statelessEnc(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + + type tableEntry struct { + offset int16 + } + + var table [slTableSize]tableEntry + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + s := int16(1) + nextEmit := int16(0) + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := int16(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load3216(src, s) + + for { + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hashSL(cv) + candidate = table[nextHash] + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit || nextS <= 0 { + goto emitRemainder + } + + now := load6416(src, nextS) + table[nextHash] = tableEntry{offset: s} + nextHash = hashSL(uint32(now)) + + if cv == load3216(src, candidate.offset) { + table[nextHash] = tableEntry{offset: nextS} + break + } + + // Do one right away... + cv = uint32(now) + s = nextS + nextS++ + candidate = table[nextHash] + now >>= 8 + table[nextHash] = tableEntry{offset: s} + + if cv == load3216(src, candidate.offset) { + table[nextHash] = tableEntry{offset: nextS} + break + } + cv = uint32(now) + s = nextS + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + t := candidate.offset + l := int16(matchLen(src[s+4:], src[t+4:]) + 4) + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + emitLiteral(dst, src[nextEmit:s]) + } + + // Save the match found + dst.AddMatchLong(int32(l), uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + if s >= sLimit { + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. + x := load6416(src, s-2) + o := s - 2 + prevHash := hashSL(uint32(x)) + table[prevHash] = tableEntry{offset: o} + x >>= 16 + currHash := hashSL(uint32(x)) + candidate = table[currHash] + table[currHash] = tableEntry{offset: o + 2} + + if uint32(x) != load3216(src, candidate.offset) { + cv = uint32(x >> 8) + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + emitLiteral(dst, src[nextEmit:]) + } +} diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go index 4f275ea61d..b3df0d8941 100644 --- a/vendor/github.com/klauspost/compress/flate/token.go +++ b/vendor/github.com/klauspost/compress/flate/token.go @@ -4,7 +4,13 @@ package flate -import "fmt" +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "math" +) const ( // 2 bits: type 0 = literal 1=EOF 2=Match 3=Unused @@ -19,7 +25,7 @@ const ( // The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH) // is lengthCodes[length - MIN_MATCH_LENGTH] -var lengthCodes = [...]uint32{ +var lengthCodes = [256]uint8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, @@ -48,7 +54,37 @@ var lengthCodes = [...]uint32{ 27, 27, 27, 27, 27, 28, } -var offsetCodes = [...]uint32{ +// lengthCodes1 is length codes, but starting at 1. +var lengthCodes1 = [256]uint8{ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, + 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, + 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, + 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, + 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, + 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, + 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 29, +} + +var offsetCodes = [256]uint32{ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, @@ -67,49 +103,265 @@ var offsetCodes = [...]uint32{ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, } +// offsetCodes14 are offsetCodes, but with 14 added. +var offsetCodes14 = [256]uint32{ + 14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, +} + type token uint32 type tokens struct { - tokens [maxStoreBlockSize + 1]token - n uint16 // Must be able to contain maxStoreBlockSize + nLits int + extraHist [32]uint16 // codes 256->maxnumlit + offHist [32]uint16 // offset codes + litHist [256]uint16 // codes 0->255 + n uint16 // Must be able to contain maxStoreBlockSize + tokens [maxStoreBlockSize + 1]token +} + +func (t *tokens) Reset() { + if t.n == 0 { + return + } + t.n = 0 + t.nLits = 0 + for i := range t.litHist[:] { + t.litHist[i] = 0 + } + for i := range t.extraHist[:] { + t.extraHist[i] = 0 + } + for i := range t.offHist[:] { + t.offHist[i] = 0 + } +} + +func (t *tokens) Fill() { + if t.n == 0 { + return + } + for i, v := range t.litHist[:] { + if v == 0 { + t.litHist[i] = 1 + t.nLits++ + } + } + for i, v := range t.extraHist[:literalCount-256] { + if v == 0 { + t.nLits++ + t.extraHist[i] = 1 + } + } + for i, v := range t.offHist[:offsetCodeCount] { + if v == 0 { + t.offHist[i] = 1 + } + } } -// Convert a literal into a literal token. -func literalToken(literal uint32) token { return token(literalType + literal) } +func indexTokens(in []token) tokens { + var t tokens + t.indexTokens(in) + return t +} + +func (t *tokens) indexTokens(in []token) { + t.Reset() + for _, tok := range in { + if tok < matchType { + t.tokens[t.n] = tok + t.litHist[tok]++ + t.n++ + continue + } + t.AddMatch(uint32(tok.length()), tok.offset()) + } +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. +func emitLiteral(dst *tokens, lit []byte) { + ol := int(dst.n) + for i, v := range lit { + dst.tokens[(i+ol)&maxStoreBlockSize] = token(v) + dst.litHist[v]++ + } + dst.n += uint16(len(lit)) + dst.nLits += len(lit) +} -// Convert a < xlength, xoffset > pair into a match token. -func matchToken(xlength uint32, xoffset uint32) token { - return token(matchType + xlength<<lengthShift + xoffset) +func (t *tokens) AddLiteral(lit byte) { + t.tokens[t.n] = token(lit) + t.litHist[lit]++ + t.n++ + t.nLits++ } -func matchTokend(xlength uint32, xoffset uint32) token { - if xlength > maxMatchLength || xoffset > maxMatchOffset { - panic(fmt.Sprintf("Invalid match: len: %d, offset: %d\n", xlength, xoffset)) - return token(matchType) +// EstimatedBits will return an minimum size estimated by an *optimal* +// compression of the block. +// The size of the block +func (t *tokens) EstimatedBits() int { + shannon := float64(0) + bits := int(0) + nMatches := 0 + if t.nLits > 0 { + invTotal := 1.0 / float64(t.nLits) + for _, v := range t.litHist[:] { + if v > 0 { + n := float64(v) + shannon += math.Ceil(-math.Log2(n*invTotal) * n) + } + } + // Just add 15 for EOB + shannon += 15 + for _, v := range t.extraHist[1 : literalCount-256] { + if v > 0 { + n := float64(v) + shannon += math.Ceil(-math.Log2(n*invTotal) * n) + bits += int(lengthExtraBits[v&31]) * int(v) + nMatches += int(v) + } + } } - return token(matchType + xlength<<lengthShift + xoffset) + if nMatches > 0 { + invTotal := 1.0 / float64(nMatches) + for _, v := range t.offHist[:offsetCodeCount] { + if v > 0 { + n := float64(v) + shannon += math.Ceil(-math.Log2(n*invTotal) * n) + bits += int(offsetExtraBits[v&31]) * int(n) + } + } + } + + return int(shannon) + bits +} + +// AddMatch adds a match to the tokens. +// This function is very sensitive to inlining and right on the border. +func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { + if debugDecode { + if xlength >= maxMatchLength+baseMatchLength { + panic(fmt.Errorf("invalid length: %v", xlength)) + } + if xoffset >= maxMatchOffset+baseMatchOffset { + panic(fmt.Errorf("invalid offset: %v", xoffset)) + } + } + t.nLits++ + lengthCode := lengthCodes1[uint8(xlength)] & 31 + t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset) + t.extraHist[lengthCode]++ + t.offHist[offsetCode(xoffset)&31]++ + t.n++ +} + +// AddMatchLong adds a match to the tokens, potentially longer than max match length. +// Length should NOT have the base subtracted, only offset should. +func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) { + if debugDecode { + if xoffset >= maxMatchOffset+baseMatchOffset { + panic(fmt.Errorf("invalid offset: %v", xoffset)) + } + } + oc := offsetCode(xoffset) & 31 + for xlength > 0 { + xl := xlength + if xl > 258 { + // We need to have at least baseMatchLength left over for next loop. + xl = 258 - baseMatchLength + } + xlength -= xl + xl -= 3 + t.nLits++ + lengthCode := lengthCodes1[uint8(xl)] & 31 + t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset) + t.extraHist[lengthCode]++ + t.offHist[oc]++ + t.n++ + } +} + +func (t *tokens) AddEOB() { + t.tokens[t.n] = token(endBlockMarker) + t.extraHist[0]++ + t.n++ +} + +func (t *tokens) Slice() []token { + return t.tokens[:t.n] +} + +// VarInt returns the tokens as varint encoded bytes. +func (t *tokens) VarInt() []byte { + var b = make([]byte, binary.MaxVarintLen32*int(t.n)) + var off int + for _, v := range t.tokens[:t.n] { + off += binary.PutUvarint(b[off:], uint64(v)) + } + return b[:off] +} + +// FromVarInt restores t to the varint encoded tokens provided. +// Any data in t is removed. +func (t *tokens) FromVarInt(b []byte) error { + var buf = bytes.NewReader(b) + var toks []token + for { + r, err := binary.ReadUvarint(buf) + if err == io.EOF { + break + } + if err != nil { + return err + } + toks = append(toks, token(r)) + } + t.indexTokens(toks) + return nil } // Returns the type of a token func (t token) typ() uint32 { return uint32(t) & typeMask } // Returns the literal of a literal token -func (t token) literal() uint32 { return uint32(t - literalType) } +func (t token) literal() uint8 { return uint8(t) } // Returns the extra offset of a match token func (t token) offset() uint32 { return uint32(t) & offsetMask } -func (t token) length() uint32 { return uint32((t - matchType) >> lengthShift) } +func (t token) length() uint8 { return uint8(t >> lengthShift) } -func lengthCode(len uint32) uint32 { return lengthCodes[len] } +// The code is never more than 8 bits, but is returned as uint32 for convenience. +func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) } // Returns the offset code corresponding to a specific offset func offsetCode(off uint32) uint32 { + if false { + if off < uint32(len(offsetCodes)) { + return offsetCodes[off&255] + } else if off>>7 < uint32(len(offsetCodes)) { + return offsetCodes[(off>>7)&255] + 14 + } else { + return offsetCodes[(off>>14)&255] + 28 + } + } if off < uint32(len(offsetCodes)) { - return offsetCodes[off] - } else if off>>7 < uint32(len(offsetCodes)) { - return offsetCodes[off>>7] + 14 - } else { - return offsetCodes[off>>14] + 28 + return offsetCodes[uint8(off)] } + return offsetCodes14[uint8(off>>7)] } diff --git a/vendor/github.com/klauspost/compress/gzip/gunzip.go b/vendor/github.com/klauspost/compress/gzip/gunzip.go index e73fab3f0f..568b5d4fb8 100644 --- a/vendor/github.com/klauspost/compress/gzip/gunzip.go +++ b/vendor/github.com/klauspost/compress/gzip/gunzip.go @@ -10,11 +10,11 @@ import ( "bufio" "encoding/binary" "errors" + "hash/crc32" "io" "time" "github.com/klauspost/compress/flate" - "github.com/klauspost/crc32" ) const ( diff --git a/vendor/github.com/klauspost/compress/gzip/gzip.go b/vendor/github.com/klauspost/compress/gzip/gzip.go index a0f3ed0fcf..ed0cc148f8 100644 --- a/vendor/github.com/klauspost/compress/gzip/gzip.go +++ b/vendor/github.com/klauspost/compress/gzip/gzip.go @@ -7,10 +7,10 @@ package gzip import ( "errors" "fmt" + "hash/crc32" "io" "github.com/klauspost/compress/flate" - "github.com/klauspost/crc32" ) // These constants are copied from the flate package, so that code that imports @@ -22,6 +22,13 @@ const ( DefaultCompression = flate.DefaultCompression ConstantCompression = flate.ConstantCompression HuffmanOnly = flate.HuffmanOnly + + // StatelessCompression will do compression but without maintaining any state + // between Write calls. + // There will be no memory kept between Write calls, + // but compression and speed will be suboptimal. + // Because of this, the size of actual Write calls will affect output size. + StatelessCompression = -3 ) // A Writer is an io.WriteCloser. @@ -59,7 +66,7 @@ func NewWriter(w io.Writer) *Writer { // integer value between BestSpeed and BestCompression inclusive. The error // returned will be nil if the level is valid. func NewWriterLevel(w io.Writer, level int) (*Writer, error) { - if level < HuffmanOnly || level > BestCompression { + if level < StatelessCompression || level > BestCompression { return nil, fmt.Errorf("gzip: invalid compression level: %d", level) } z := new(Writer) @@ -69,9 +76,12 @@ func NewWriterLevel(w io.Writer, level int) (*Writer, error) { func (z *Writer) init(w io.Writer, level int) { compressor := z.compressor - if compressor != nil { - compressor.Reset(w) + if level != StatelessCompression { + if compressor != nil { + compressor.Reset(w) + } } + *z = Writer{ Header: Header{ OS: 255, // unknown @@ -189,12 +199,16 @@ func (z *Writer) Write(p []byte) (int, error) { return n, z.err } } - if z.compressor == nil { + + if z.compressor == nil && z.level != StatelessCompression { z.compressor, _ = flate.NewWriter(z.w, z.level) } } z.size += uint32(len(p)) z.digest = crc32.Update(z.digest, crc32.IEEETable, p) + if z.level == StatelessCompression { + return len(p), flate.StatelessDeflate(z.w, p, false) + } n, z.err = z.compressor.Write(p) return n, z.err } @@ -211,7 +225,7 @@ func (z *Writer) Flush() error { if z.err != nil { return z.err } - if z.closed { + if z.closed || z.level == StatelessCompression { return nil } if !z.wroteHeader { @@ -240,7 +254,11 @@ func (z *Writer) Close() error { return z.err } } - z.err = z.compressor.Close() + if z.level == StatelessCompression { + z.err = flate.StatelessDeflate(z.w, nil, true) + } else { + z.err = z.compressor.Close() + } if z.err != nil { return z.err } diff --git a/vendor/github.com/klauspost/cpuid/.gitignore b/vendor/github.com/klauspost/cpuid/.gitignore deleted file mode 100644 index daf913b1b3..0000000000 --- a/vendor/github.com/klauspost/cpuid/.gitignore +++ /dev/null @@ -1,24 +0,0 @@ -# Compiled Object files, Static and Dynamic libs (Shared Objects) -*.o -*.a -*.so - -# Folders -_obj -_test - -# Architecture specific extensions/prefixes -*.[568vq] -[568vq].out - -*.cgo1.go -*.cgo2.c -_cgo_defun.c -_cgo_gotypes.go -_cgo_export.* - -_testmain.go - -*.exe -*.test -*.prof diff --git a/vendor/github.com/klauspost/cpuid/.travis.yml b/vendor/github.com/klauspost/cpuid/.travis.yml deleted file mode 100644 index bde823d8ab..0000000000 --- a/vendor/github.com/klauspost/cpuid/.travis.yml +++ /dev/null @@ -1,8 +0,0 @@ -language: go - -go: - - 1.3 - - 1.4 - - 1.5 - - 1.6 - - tip diff --git a/vendor/github.com/klauspost/cpuid/LICENSE b/vendor/github.com/klauspost/cpuid/LICENSE deleted file mode 100644 index 5cec7ee949..0000000000 --- a/vendor/github.com/klauspost/cpuid/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2015 Klaus Post - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - diff --git a/vendor/github.com/klauspost/cpuid/README.md b/vendor/github.com/klauspost/cpuid/README.md deleted file mode 100644 index b2b6bee879..0000000000 --- a/vendor/github.com/klauspost/cpuid/README.md +++ /dev/null @@ -1,145 +0,0 @@ -# cpuid -Package cpuid provides information about the CPU running the current program. - -CPU features are detected on startup, and kept for fast access through the life of the application. -Currently x86 / x64 (AMD64) is supported, and no external C (cgo) code is used, which should make the library very easy to use. - -You can access the CPU information by accessing the shared CPU variable of the cpuid library. - -Package home: https://github.com/klauspost/cpuid - -[![GoDoc][1]][2] [![Build Status][3]][4] - -[1]: https://godoc.org/github.com/klauspost/cpuid?status.svg -[2]: https://godoc.org/github.com/klauspost/cpuid -[3]: https://travis-ci.org/klauspost/cpuid.svg -[4]: https://travis-ci.org/klauspost/cpuid - -# features -## CPU Instructions -* **CMOV** (i686 CMOV) -* **NX** (NX (No-Execute) bit) -* **AMD3DNOW** (AMD 3DNOW) -* **AMD3DNOWEXT** (AMD 3DNowExt) -* **MMX** (standard MMX) -* **MMXEXT** (SSE integer functions or AMD MMX ext) -* **SSE** (SSE functions) -* **SSE2** (P4 SSE functions) -* **SSE3** (Prescott SSE3 functions) -* **SSSE3** (Conroe SSSE3 functions) -* **SSE4** (Penryn SSE4.1 functions) -* **SSE4A** (AMD Barcelona microarchitecture SSE4a instructions) -* **SSE42** (Nehalem SSE4.2 functions) -* **AVX** (AVX functions) -* **AVX2** (AVX2 functions) -* **FMA3** (Intel FMA 3) -* **FMA4** (Bulldozer FMA4 functions) -* **XOP** (Bulldozer XOP functions) -* **F16C** (Half-precision floating-point conversion) -* **BMI1** (Bit Manipulation Instruction Set 1) -* **BMI2** (Bit Manipulation Instruction Set 2) -* **TBM** (AMD Trailing Bit Manipulation) -* **LZCNT** (LZCNT instruction) -* **POPCNT** (POPCNT instruction) -* **AESNI** (Advanced Encryption Standard New Instructions) -* **CLMUL** (Carry-less Multiplication) -* **HTT** (Hyperthreading (enabled)) -* **HLE** (Hardware Lock Elision) -* **RTM** (Restricted Transactional Memory) -* **RDRAND** (RDRAND instruction is available) -* **RDSEED** (RDSEED instruction is available) -* **ADX** (Intel ADX (Multi-Precision Add-Carry Instruction Extensions)) -* **SHA** (Intel SHA Extensions) -* **AVX512F** (AVX-512 Foundation) -* **AVX512DQ** (AVX-512 Doubleword and Quadword Instructions) -* **AVX512IFMA** (AVX-512 Integer Fused Multiply-Add Instructions) -* **AVX512PF** (AVX-512 Prefetch Instructions) -* **AVX512ER** (AVX-512 Exponential and Reciprocal Instructions) -* **AVX512CD** (AVX-512 Conflict Detection Instructions) -* **AVX512BW** (AVX-512 Byte and Word Instructions) -* **AVX512VL** (AVX-512 Vector Length Extensions) -* **AVX512VBMI** (AVX-512 Vector Bit Manipulation Instructions) -* **MPX** (Intel MPX (Memory Protection Extensions)) -* **ERMS** (Enhanced REP MOVSB/STOSB) -* **RDTSCP** (RDTSCP Instruction) -* **CX16** (CMPXCHG16B Instruction) -* **SGX** (Software Guard Extensions, with activation details) - -## Performance -* **RDTSCP()** Returns current cycle count. Can be used for benchmarking. -* **SSE2SLOW** (SSE2 is supported, but usually not faster) -* **SSE3SLOW** (SSE3 is supported, but usually not faster) -* **ATOM** (Atom processor, some SSSE3 instructions are slower) -* **Cache line** (Probable size of a cache line). -* **L1, L2, L3 Cache size** on newer Intel/AMD CPUs. - -## Cpu Vendor/VM -* **Intel** -* **AMD** -* **VIA** -* **Transmeta** -* **NSC** -* **KVM** (Kernel-based Virtual Machine) -* **MSVM** (Microsoft Hyper-V or Windows Virtual PC) -* **VMware** -* **XenHVM** - -# installing - -```go get github.com/klauspost/cpuid``` - -# example - -```Go -package main - -import ( - "fmt" - "github.com/klauspost/cpuid" -) - -func main() { - // Print basic CPU information: - fmt.Println("Name:", cpuid.CPU.BrandName) - fmt.Println("PhysicalCores:", cpuid.CPU.PhysicalCores) - fmt.Println("ThreadsPerCore:", cpuid.CPU.ThreadsPerCore) - fmt.Println("LogicalCores:", cpuid.CPU.LogicalCores) - fmt.Println("Family", cpuid.CPU.Family, "Model:", cpuid.CPU.Model) - fmt.Println("Features:", cpuid.CPU.Features) - fmt.Println("Cacheline bytes:", cpuid.CPU.CacheLine) - fmt.Println("L1 Data Cache:", cpuid.CPU.Cache.L1D, "bytes") - fmt.Println("L1 Instruction Cache:", cpuid.CPU.Cache.L1D, "bytes") - fmt.Println("L2 Cache:", cpuid.CPU.Cache.L2, "bytes") - fmt.Println("L3 Cache:", cpuid.CPU.Cache.L3, "bytes") - - // Test if we have a specific feature: - if cpuid.CPU.SSE() { - fmt.Println("We have Streaming SIMD Extensions") - } -} -``` - -Sample output: -``` ->go run main.go -Name: Intel(R) Core(TM) i5-2540M CPU @ 2.60GHz -PhysicalCores: 2 -ThreadsPerCore: 2 -LogicalCores: 4 -Family 6 Model: 42 -Features: CMOV,MMX,MMXEXT,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,AVX,AESNI,CLMUL -Cacheline bytes: 64 -We have Streaming SIMD Extensions -``` - -# private package - -In the "private" folder you can find an autogenerated version of the library you can include in your own packages. - -For this purpose all exports are removed, and functions and constants are lowercased. - -This is not a recommended way of using the library, but provided for convenience, if it is difficult for you to use external packages. - -# license - -This code is published under an MIT license. See LICENSE file for more information. diff --git a/vendor/github.com/klauspost/cpuid/cpuid.go b/vendor/github.com/klauspost/cpuid/cpuid.go deleted file mode 100644 index 9230ca5628..0000000000 --- a/vendor/github.com/klauspost/cpuid/cpuid.go +++ /dev/null @@ -1,1022 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -// Package cpuid provides information about the CPU running the current program. -// -// CPU features are detected on startup, and kept for fast access through the life of the application. -// Currently x86 / x64 (AMD64) is supported. -// -// You can access the CPU information by accessing the shared CPU variable of the cpuid library. -// -// Package home: https://github.com/klauspost/cpuid -package cpuid - -import "strings" - -// Vendor is a representation of a CPU vendor. -type Vendor int - -const ( - Other Vendor = iota - Intel - AMD - VIA - Transmeta - NSC - KVM // Kernel-based Virtual Machine - MSVM // Microsoft Hyper-V or Windows Virtual PC - VMware - XenHVM -) - -const ( - CMOV = 1 << iota // i686 CMOV - NX // NX (No-Execute) bit - AMD3DNOW // AMD 3DNOW - AMD3DNOWEXT // AMD 3DNowExt - MMX // standard MMX - MMXEXT // SSE integer functions or AMD MMX ext - SSE // SSE functions - SSE2 // P4 SSE functions - SSE3 // Prescott SSE3 functions - SSSE3 // Conroe SSSE3 functions - SSE4 // Penryn SSE4.1 functions - SSE4A // AMD Barcelona microarchitecture SSE4a instructions - SSE42 // Nehalem SSE4.2 functions - AVX // AVX functions - AVX2 // AVX2 functions - FMA3 // Intel FMA 3 - FMA4 // Bulldozer FMA4 functions - XOP // Bulldozer XOP functions - F16C // Half-precision floating-point conversion - BMI1 // Bit Manipulation Instruction Set 1 - BMI2 // Bit Manipulation Instruction Set 2 - TBM // AMD Trailing Bit Manipulation - LZCNT // LZCNT instruction - POPCNT // POPCNT instruction - AESNI // Advanced Encryption Standard New Instructions - CLMUL // Carry-less Multiplication - HTT // Hyperthreading (enabled) - HLE // Hardware Lock Elision - RTM // Restricted Transactional Memory - RDRAND // RDRAND instruction is available - RDSEED // RDSEED instruction is available - ADX // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) - SHA // Intel SHA Extensions - AVX512F // AVX-512 Foundation - AVX512DQ // AVX-512 Doubleword and Quadword Instructions - AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions - AVX512PF // AVX-512 Prefetch Instructions - AVX512ER // AVX-512 Exponential and Reciprocal Instructions - AVX512CD // AVX-512 Conflict Detection Instructions - AVX512BW // AVX-512 Byte and Word Instructions - AVX512VL // AVX-512 Vector Length Extensions - AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions - MPX // Intel MPX (Memory Protection Extensions) - ERMS // Enhanced REP MOVSB/STOSB - RDTSCP // RDTSCP Instruction - CX16 // CMPXCHG16B Instruction - SGX // Software Guard Extensions - - // Performance indicators - SSE2SLOW // SSE2 is supported, but usually not faster - SSE3SLOW // SSE3 is supported, but usually not faster - ATOM // Atom processor, some SSSE3 instructions are slower -) - -var flagNames = map[Flags]string{ - CMOV: "CMOV", // i686 CMOV - NX: "NX", // NX (No-Execute) bit - AMD3DNOW: "AMD3DNOW", // AMD 3DNOW - AMD3DNOWEXT: "AMD3DNOWEXT", // AMD 3DNowExt - MMX: "MMX", // Standard MMX - MMXEXT: "MMXEXT", // SSE integer functions or AMD MMX ext - SSE: "SSE", // SSE functions - SSE2: "SSE2", // P4 SSE2 functions - SSE3: "SSE3", // Prescott SSE3 functions - SSSE3: "SSSE3", // Conroe SSSE3 functions - SSE4: "SSE4.1", // Penryn SSE4.1 functions - SSE4A: "SSE4A", // AMD Barcelona microarchitecture SSE4a instructions - SSE42: "SSE4.2", // Nehalem SSE4.2 functions - AVX: "AVX", // AVX functions - AVX2: "AVX2", // AVX functions - FMA3: "FMA3", // Intel FMA 3 - FMA4: "FMA4", // Bulldozer FMA4 functions - XOP: "XOP", // Bulldozer XOP functions - F16C: "F16C", // Half-precision floating-point conversion - BMI1: "BMI1", // Bit Manipulation Instruction Set 1 - BMI2: "BMI2", // Bit Manipulation Instruction Set 2 - TBM: "TBM", // AMD Trailing Bit Manipulation - LZCNT: "LZCNT", // LZCNT instruction - POPCNT: "POPCNT", // POPCNT instruction - AESNI: "AESNI", // Advanced Encryption Standard New Instructions - CLMUL: "CLMUL", // Carry-less Multiplication - HTT: "HTT", // Hyperthreading (enabled) - HLE: "HLE", // Hardware Lock Elision - RTM: "RTM", // Restricted Transactional Memory - RDRAND: "RDRAND", // RDRAND instruction is available - RDSEED: "RDSEED", // RDSEED instruction is available - ADX: "ADX", // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) - SHA: "SHA", // Intel SHA Extensions - AVX512F: "AVX512F", // AVX-512 Foundation - AVX512DQ: "AVX512DQ", // AVX-512 Doubleword and Quadword Instructions - AVX512IFMA: "AVX512IFMA", // AVX-512 Integer Fused Multiply-Add Instructions - AVX512PF: "AVX512PF", // AVX-512 Prefetch Instructions - AVX512ER: "AVX512ER", // AVX-512 Exponential and Reciprocal Instructions - AVX512CD: "AVX512CD", // AVX-512 Conflict Detection Instructions - AVX512BW: "AVX512BW", // AVX-512 Byte and Word Instructions - AVX512VL: "AVX512VL", // AVX-512 Vector Length Extensions - AVX512VBMI: "AVX512VBMI", // AVX-512 Vector Bit Manipulation Instructions - MPX: "MPX", // Intel MPX (Memory Protection Extensions) - ERMS: "ERMS", // Enhanced REP MOVSB/STOSB - RDTSCP: "RDTSCP", // RDTSCP Instruction - CX16: "CX16", // CMPXCHG16B Instruction - SGX: "SGX", // Software Guard Extensions - - // Performance indicators - SSE2SLOW: "SSE2SLOW", // SSE2 supported, but usually not faster - SSE3SLOW: "SSE3SLOW", // SSE3 supported, but usually not faster - ATOM: "ATOM", // Atom processor, some SSSE3 instructions are slower - -} - -// CPUInfo contains information about the detected system CPU. -type CPUInfo struct { - BrandName string // Brand name reported by the CPU - VendorID Vendor // Comparable CPU vendor ID - Features Flags // Features of the CPU - PhysicalCores int // Number of physical processor cores in your CPU. Will be 0 if undetectable. - ThreadsPerCore int // Number of threads per physical core. Will be 1 if undetectable. - LogicalCores int // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable. - Family int // CPU family number - Model int // CPU model number - CacheLine int // Cache line size in bytes. Will be 0 if undetectable. - Cache struct { - L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected - L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected - L2 int // L2 Cache (per core or shared). Will be -1 if undetected - L3 int // L3 Instruction Cache (per core or shared). Will be -1 if undetected - } - SGX SGXSupport - maxFunc uint32 - maxExFunc uint32 -} - -var cpuid func(op uint32) (eax, ebx, ecx, edx uint32) -var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32) -var xgetbv func(index uint32) (eax, edx uint32) -var rdtscpAsm func() (eax, ebx, ecx, edx uint32) - -// CPU contains information about the CPU as detected on startup, -// or when Detect last was called. -// -// Use this as the primary entry point to you data, -// this way queries are -var CPU CPUInfo - -func init() { - initCPU() - Detect() -} - -// Detect will re-detect current CPU info. -// This will replace the content of the exported CPU variable. -// -// Unless you expect the CPU to change while you are running your program -// you should not need to call this function. -// If you call this, you must ensure that no other goroutine is accessing the -// exported CPU variable. -func Detect() { - CPU.maxFunc = maxFunctionID() - CPU.maxExFunc = maxExtendedFunction() - CPU.BrandName = brandName() - CPU.CacheLine = cacheLine() - CPU.Family, CPU.Model = familyModel() - CPU.Features = support() - CPU.SGX = sgx(CPU.Features&SGX != 0) - CPU.ThreadsPerCore = threadsPerCore() - CPU.LogicalCores = logicalCores() - CPU.PhysicalCores = physicalCores() - CPU.VendorID = vendorID() - CPU.cacheSize() -} - -// Generated here: http://play.golang.org/p/BxFH2Gdc0G - -// Cmov indicates support of CMOV instructions -func (c CPUInfo) Cmov() bool { - return c.Features&CMOV != 0 -} - -// Amd3dnow indicates support of AMD 3DNOW! instructions -func (c CPUInfo) Amd3dnow() bool { - return c.Features&AMD3DNOW != 0 -} - -// Amd3dnowExt indicates support of AMD 3DNOW! Extended instructions -func (c CPUInfo) Amd3dnowExt() bool { - return c.Features&AMD3DNOWEXT != 0 -} - -// MMX indicates support of MMX instructions -func (c CPUInfo) MMX() bool { - return c.Features&MMX != 0 -} - -// MMXExt indicates support of MMXEXT instructions -// (SSE integer functions or AMD MMX ext) -func (c CPUInfo) MMXExt() bool { - return c.Features&MMXEXT != 0 -} - -// SSE indicates support of SSE instructions -func (c CPUInfo) SSE() bool { - return c.Features&SSE != 0 -} - -// SSE2 indicates support of SSE 2 instructions -func (c CPUInfo) SSE2() bool { - return c.Features&SSE2 != 0 -} - -// SSE3 indicates support of SSE 3 instructions -func (c CPUInfo) SSE3() bool { - return c.Features&SSE3 != 0 -} - -// SSSE3 indicates support of SSSE 3 instructions -func (c CPUInfo) SSSE3() bool { - return c.Features&SSSE3 != 0 -} - -// SSE4 indicates support of SSE 4 (also called SSE 4.1) instructions -func (c CPUInfo) SSE4() bool { - return c.Features&SSE4 != 0 -} - -// SSE42 indicates support of SSE4.2 instructions -func (c CPUInfo) SSE42() bool { - return c.Features&SSE42 != 0 -} - -// AVX indicates support of AVX instructions -// and operating system support of AVX instructions -func (c CPUInfo) AVX() bool { - return c.Features&AVX != 0 -} - -// AVX2 indicates support of AVX2 instructions -func (c CPUInfo) AVX2() bool { - return c.Features&AVX2 != 0 -} - -// FMA3 indicates support of FMA3 instructions -func (c CPUInfo) FMA3() bool { - return c.Features&FMA3 != 0 -} - -// FMA4 indicates support of FMA4 instructions -func (c CPUInfo) FMA4() bool { - return c.Features&FMA4 != 0 -} - -// XOP indicates support of XOP instructions -func (c CPUInfo) XOP() bool { - return c.Features&XOP != 0 -} - -// F16C indicates support of F16C instructions -func (c CPUInfo) F16C() bool { - return c.Features&F16C != 0 -} - -// BMI1 indicates support of BMI1 instructions -func (c CPUInfo) BMI1() bool { - return c.Features&BMI1 != 0 -} - -// BMI2 indicates support of BMI2 instructions -func (c CPUInfo) BMI2() bool { - return c.Features&BMI2 != 0 -} - -// TBM indicates support of TBM instructions -// (AMD Trailing Bit Manipulation) -func (c CPUInfo) TBM() bool { - return c.Features&TBM != 0 -} - -// Lzcnt indicates support of LZCNT instruction -func (c CPUInfo) Lzcnt() bool { - return c.Features&LZCNT != 0 -} - -// Popcnt indicates support of POPCNT instruction -func (c CPUInfo) Popcnt() bool { - return c.Features&POPCNT != 0 -} - -// HTT indicates the processor has Hyperthreading enabled -func (c CPUInfo) HTT() bool { - return c.Features&HTT != 0 -} - -// SSE2Slow indicates that SSE2 may be slow on this processor -func (c CPUInfo) SSE2Slow() bool { - return c.Features&SSE2SLOW != 0 -} - -// SSE3Slow indicates that SSE3 may be slow on this processor -func (c CPUInfo) SSE3Slow() bool { - return c.Features&SSE3SLOW != 0 -} - -// AesNi indicates support of AES-NI instructions -// (Advanced Encryption Standard New Instructions) -func (c CPUInfo) AesNi() bool { - return c.Features&AESNI != 0 -} - -// Clmul indicates support of CLMUL instructions -// (Carry-less Multiplication) -func (c CPUInfo) Clmul() bool { - return c.Features&CLMUL != 0 -} - -// NX indicates support of NX (No-Execute) bit -func (c CPUInfo) NX() bool { - return c.Features&NX != 0 -} - -// SSE4A indicates support of AMD Barcelona microarchitecture SSE4a instructions -func (c CPUInfo) SSE4A() bool { - return c.Features&SSE4A != 0 -} - -// HLE indicates support of Hardware Lock Elision -func (c CPUInfo) HLE() bool { - return c.Features&HLE != 0 -} - -// RTM indicates support of Restricted Transactional Memory -func (c CPUInfo) RTM() bool { - return c.Features&RTM != 0 -} - -// Rdrand indicates support of RDRAND instruction is available -func (c CPUInfo) Rdrand() bool { - return c.Features&RDRAND != 0 -} - -// Rdseed indicates support of RDSEED instruction is available -func (c CPUInfo) Rdseed() bool { - return c.Features&RDSEED != 0 -} - -// ADX indicates support of Intel ADX (Multi-Precision Add-Carry Instruction Extensions) -func (c CPUInfo) ADX() bool { - return c.Features&ADX != 0 -} - -// SHA indicates support of Intel SHA Extensions -func (c CPUInfo) SHA() bool { - return c.Features&SHA != 0 -} - -// AVX512F indicates support of AVX-512 Foundation -func (c CPUInfo) AVX512F() bool { - return c.Features&AVX512F != 0 -} - -// AVX512DQ indicates support of AVX-512 Doubleword and Quadword Instructions -func (c CPUInfo) AVX512DQ() bool { - return c.Features&AVX512DQ != 0 -} - -// AVX512IFMA indicates support of AVX-512 Integer Fused Multiply-Add Instructions -func (c CPUInfo) AVX512IFMA() bool { - return c.Features&AVX512IFMA != 0 -} - -// AVX512PF indicates support of AVX-512 Prefetch Instructions -func (c CPUInfo) AVX512PF() bool { - return c.Features&AVX512PF != 0 -} - -// AVX512ER indicates support of AVX-512 Exponential and Reciprocal Instructions -func (c CPUInfo) AVX512ER() bool { - return c.Features&AVX512ER != 0 -} - -// AVX512CD indicates support of AVX-512 Conflict Detection Instructions -func (c CPUInfo) AVX512CD() bool { - return c.Features&AVX512CD != 0 -} - -// AVX512BW indicates support of AVX-512 Byte and Word Instructions -func (c CPUInfo) AVX512BW() bool { - return c.Features&AVX512BW != 0 -} - -// AVX512VL indicates support of AVX-512 Vector Length Extensions -func (c CPUInfo) AVX512VL() bool { - return c.Features&AVX512VL != 0 -} - -// AVX512VBMI indicates support of AVX-512 Vector Bit Manipulation Instructions -func (c CPUInfo) AVX512VBMI() bool { - return c.Features&AVX512VBMI != 0 -} - -// MPX indicates support of Intel MPX (Memory Protection Extensions) -func (c CPUInfo) MPX() bool { - return c.Features&MPX != 0 -} - -// ERMS indicates support of Enhanced REP MOVSB/STOSB -func (c CPUInfo) ERMS() bool { - return c.Features&ERMS != 0 -} - -func (c CPUInfo) RDTSCP() bool { - return c.Features&RDTSCP != 0 -} - -func (c CPUInfo) CX16() bool { - return c.Features&CX16 != 0 -} - -// Atom indicates an Atom processor -func (c CPUInfo) Atom() bool { - return c.Features&ATOM != 0 -} - -// Intel returns true if vendor is recognized as Intel -func (c CPUInfo) Intel() bool { - return c.VendorID == Intel -} - -// AMD returns true if vendor is recognized as AMD -func (c CPUInfo) AMD() bool { - return c.VendorID == AMD -} - -// Transmeta returns true if vendor is recognized as Transmeta -func (c CPUInfo) Transmeta() bool { - return c.VendorID == Transmeta -} - -// NSC returns true if vendor is recognized as National Semiconductor -func (c CPUInfo) NSC() bool { - return c.VendorID == NSC -} - -// VIA returns true if vendor is recognized as VIA -func (c CPUInfo) VIA() bool { - return c.VendorID == VIA -} - -// RTCounter returns the 64-bit time-stamp counter -// Uses the RDTSCP instruction. The value 0 is returned -// if the CPU does not support the instruction. -func (c CPUInfo) RTCounter() uint64 { - if !c.RDTSCP() { - return 0 - } - a, _, _, d := rdtscpAsm() - return uint64(a) | (uint64(d) << 32) -} - -// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP. -// This variable is OS dependent, but on Linux contains information -// about the current cpu/core the code is running on. -// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned. -func (c CPUInfo) Ia32TscAux() uint32 { - if !c.RDTSCP() { - return 0 - } - _, _, ecx, _ := rdtscpAsm() - return ecx -} - -// LogicalCPU will return the Logical CPU the code is currently executing on. -// This is likely to change when the OS re-schedules the running thread -// to another CPU. -// If the current core cannot be detected, -1 will be returned. -func (c CPUInfo) LogicalCPU() int { - if c.maxFunc < 1 { - return -1 - } - _, ebx, _, _ := cpuid(1) - return int(ebx >> 24) -} - -// VM Will return true if the cpu id indicates we are in -// a virtual machine. This is only a hint, and will very likely -// have many false negatives. -func (c CPUInfo) VM() bool { - switch c.VendorID { - case MSVM, KVM, VMware, XenHVM: - return true - } - return false -} - -// Flags contains detected cpu features and caracteristics -type Flags uint64 - -// String returns a string representation of the detected -// CPU features. -func (f Flags) String() string { - return strings.Join(f.Strings(), ",") -} - -// Strings returns and array of the detected features. -func (f Flags) Strings() []string { - s := support() - r := make([]string, 0, 20) - for i := uint(0); i < 64; i++ { - key := Flags(1 << i) - val := flagNames[key] - if s&key != 0 { - r = append(r, val) - } - } - return r -} - -func maxExtendedFunction() uint32 { - eax, _, _, _ := cpuid(0x80000000) - return eax -} - -func maxFunctionID() uint32 { - a, _, _, _ := cpuid(0) - return a -} - -func brandName() string { - if maxExtendedFunction() >= 0x80000004 { - v := make([]uint32, 0, 48) - for i := uint32(0); i < 3; i++ { - a, b, c, d := cpuid(0x80000002 + i) - v = append(v, a, b, c, d) - } - return strings.Trim(string(valAsString(v...)), " ") - } - return "unknown" -} - -func threadsPerCore() int { - mfi := maxFunctionID() - if mfi < 0x4 || vendorID() != Intel { - return 1 - } - - if mfi < 0xb { - _, b, _, d := cpuid(1) - if (d & (1 << 28)) != 0 { - // v will contain logical core count - v := (b >> 16) & 255 - if v > 1 { - a4, _, _, _ := cpuid(4) - // physical cores - v2 := (a4 >> 26) + 1 - if v2 > 0 { - return int(v) / int(v2) - } - } - } - return 1 - } - _, b, _, _ := cpuidex(0xb, 0) - if b&0xffff == 0 { - return 1 - } - return int(b & 0xffff) -} - -func logicalCores() int { - mfi := maxFunctionID() - switch vendorID() { - case Intel: - // Use this on old Intel processors - if mfi < 0xb { - if mfi < 1 { - return 0 - } - // CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID) - // that can be assigned to logical processors in a physical package. - // The value may not be the same as the number of logical processors that are present in the hardware of a physical package. - _, ebx, _, _ := cpuid(1) - logical := (ebx >> 16) & 0xff - return int(logical) - } - _, b, _, _ := cpuidex(0xb, 1) - return int(b & 0xffff) - case AMD: - _, b, _, _ := cpuid(1) - return int((b >> 16) & 0xff) - default: - return 0 - } -} - -func familyModel() (int, int) { - if maxFunctionID() < 0x1 { - return 0, 0 - } - eax, _, _, _ := cpuid(1) - family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff) - model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0) - return int(family), int(model) -} - -func physicalCores() int { - switch vendorID() { - case Intel: - return logicalCores() / threadsPerCore() - case AMD: - if maxExtendedFunction() >= 0x80000008 { - _, _, c, _ := cpuid(0x80000008) - return int(c&0xff) + 1 - } - } - return 0 -} - -// Except from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID -var vendorMapping = map[string]Vendor{ - "AMDisbetter!": AMD, - "AuthenticAMD": AMD, - "CentaurHauls": VIA, - "GenuineIntel": Intel, - "TransmetaCPU": Transmeta, - "GenuineTMx86": Transmeta, - "Geode by NSC": NSC, - "VIA VIA VIA ": VIA, - "KVMKVMKVMKVM": KVM, - "Microsoft Hv": MSVM, - "VMwareVMware": VMware, - "XenVMMXenVMM": XenHVM, -} - -func vendorID() Vendor { - _, b, c, d := cpuid(0) - v := valAsString(b, d, c) - vend, ok := vendorMapping[string(v)] - if !ok { - return Other - } - return vend -} - -func cacheLine() int { - if maxFunctionID() < 0x1 { - return 0 - } - - _, ebx, _, _ := cpuid(1) - cache := (ebx & 0xff00) >> 5 // cflush size - if cache == 0 && maxExtendedFunction() >= 0x80000006 { - _, _, ecx, _ := cpuid(0x80000006) - cache = ecx & 0xff // cacheline size - } - // TODO: Read from Cache and TLB Information - return int(cache) -} - -func (c *CPUInfo) cacheSize() { - c.Cache.L1D = -1 - c.Cache.L1I = -1 - c.Cache.L2 = -1 - c.Cache.L3 = -1 - vendor := vendorID() - switch vendor { - case Intel: - if maxFunctionID() < 4 { - return - } - for i := uint32(0); ; i++ { - eax, ebx, ecx, _ := cpuidex(4, i) - cacheType := eax & 15 - if cacheType == 0 { - break - } - cacheLevel := (eax >> 5) & 7 - coherency := int(ebx&0xfff) + 1 - partitions := int((ebx>>12)&0x3ff) + 1 - associativity := int((ebx>>22)&0x3ff) + 1 - sets := int(ecx) + 1 - size := associativity * partitions * coherency * sets - switch cacheLevel { - case 1: - if cacheType == 1 { - // 1 = Data Cache - c.Cache.L1D = size - } else if cacheType == 2 { - // 2 = Instruction Cache - c.Cache.L1I = size - } else { - if c.Cache.L1D < 0 { - c.Cache.L1I = size - } - if c.Cache.L1I < 0 { - c.Cache.L1I = size - } - } - case 2: - c.Cache.L2 = size - case 3: - c.Cache.L3 = size - } - } - case AMD: - // Untested. - if maxExtendedFunction() < 0x80000005 { - return - } - _, _, ecx, edx := cpuid(0x80000005) - c.Cache.L1D = int(((ecx >> 24) & 0xFF) * 1024) - c.Cache.L1I = int(((edx >> 24) & 0xFF) * 1024) - - if maxExtendedFunction() < 0x80000006 { - return - } - _, _, ecx, _ = cpuid(0x80000006) - c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024) - } - - return -} - -type SGXSupport struct { - Available bool - SGX1Supported bool - SGX2Supported bool - MaxEnclaveSizeNot64 int64 - MaxEnclaveSize64 int64 -} - -func sgx(available bool) (rval SGXSupport) { - rval.Available = available - - if !available { - return - } - - a, _, _, d := cpuidex(0x12, 0) - rval.SGX1Supported = a&0x01 != 0 - rval.SGX2Supported = a&0x02 != 0 - rval.MaxEnclaveSizeNot64 = 1 << (d & 0xFF) // pow 2 - rval.MaxEnclaveSize64 = 1 << ((d >> 8) & 0xFF) // pow 2 - - return -} - -func support() Flags { - mfi := maxFunctionID() - vend := vendorID() - if mfi < 0x1 { - return 0 - } - rval := uint64(0) - _, _, c, d := cpuid(1) - if (d & (1 << 15)) != 0 { - rval |= CMOV - } - if (d & (1 << 23)) != 0 { - rval |= MMX - } - if (d & (1 << 25)) != 0 { - rval |= MMXEXT - } - if (d & (1 << 25)) != 0 { - rval |= SSE - } - if (d & (1 << 26)) != 0 { - rval |= SSE2 - } - if (c & 1) != 0 { - rval |= SSE3 - } - if (c & 0x00000200) != 0 { - rval |= SSSE3 - } - if (c & 0x00080000) != 0 { - rval |= SSE4 - } - if (c & 0x00100000) != 0 { - rval |= SSE42 - } - if (c & (1 << 25)) != 0 { - rval |= AESNI - } - if (c & (1 << 1)) != 0 { - rval |= CLMUL - } - if c&(1<<23) != 0 { - rval |= POPCNT - } - if c&(1<<30) != 0 { - rval |= RDRAND - } - if c&(1<<29) != 0 { - rval |= F16C - } - if c&(1<<13) != 0 { - rval |= CX16 - } - if vend == Intel && (d&(1<<28)) != 0 && mfi >= 4 { - if threadsPerCore() > 1 { - rval |= HTT - } - } - - // Check XGETBV, OXSAVE and AVX bits - if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 { - // Check for OS support - eax, _ := xgetbv(0) - if (eax & 0x6) == 0x6 { - rval |= AVX - if (c & 0x00001000) != 0 { - rval |= FMA3 - } - } - } - - // Check AVX2, AVX2 requires OS support, but BMI1/2 don't. - if mfi >= 7 { - _, ebx, ecx, _ := cpuidex(7, 0) - if (rval&AVX) != 0 && (ebx&0x00000020) != 0 { - rval |= AVX2 - } - if (ebx & 0x00000008) != 0 { - rval |= BMI1 - if (ebx & 0x00000100) != 0 { - rval |= BMI2 - } - } - if ebx&(1<<2) != 0 { - rval |= SGX - } - if ebx&(1<<4) != 0 { - rval |= HLE - } - if ebx&(1<<9) != 0 { - rval |= ERMS - } - if ebx&(1<<11) != 0 { - rval |= RTM - } - if ebx&(1<<14) != 0 { - rval |= MPX - } - if ebx&(1<<18) != 0 { - rval |= RDSEED - } - if ebx&(1<<19) != 0 { - rval |= ADX - } - if ebx&(1<<29) != 0 { - rval |= SHA - } - - // Only detect AVX-512 features if XGETBV is supported - if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) { - // Check for OS support - eax, _ := xgetbv(0) - - // Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and - // ZMM16-ZMM31 state are enabled by OS) - /// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS). - if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 { - if ebx&(1<<16) != 0 { - rval |= AVX512F - } - if ebx&(1<<17) != 0 { - rval |= AVX512DQ - } - if ebx&(1<<21) != 0 { - rval |= AVX512IFMA - } - if ebx&(1<<26) != 0 { - rval |= AVX512PF - } - if ebx&(1<<27) != 0 { - rval |= AVX512ER - } - if ebx&(1<<28) != 0 { - rval |= AVX512CD - } - if ebx&(1<<30) != 0 { - rval |= AVX512BW - } - if ebx&(1<<31) != 0 { - rval |= AVX512VL - } - // ecx - if ecx&(1<<1) != 0 { - rval |= AVX512VBMI - } - } - } - } - - if maxExtendedFunction() >= 0x80000001 { - _, _, c, d := cpuid(0x80000001) - if (c & (1 << 5)) != 0 { - rval |= LZCNT - rval |= POPCNT - } - if (d & (1 << 31)) != 0 { - rval |= AMD3DNOW - } - if (d & (1 << 30)) != 0 { - rval |= AMD3DNOWEXT - } - if (d & (1 << 23)) != 0 { - rval |= MMX - } - if (d & (1 << 22)) != 0 { - rval |= MMXEXT - } - if (c & (1 << 6)) != 0 { - rval |= SSE4A - } - if d&(1<<20) != 0 { - rval |= NX - } - if d&(1<<27) != 0 { - rval |= RDTSCP - } - - /* Allow for selectively disabling SSE2 functions on AMD processors - with SSE2 support but not SSE4a. This includes Athlon64, some - Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster - than SSE2 often enough to utilize this special-case flag. - AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case - so that SSE2 is used unless explicitly disabled by checking - AV_CPU_FLAG_SSE2SLOW. */ - if vendorID() != Intel && - rval&SSE2 != 0 && (c&0x00000040) == 0 { - rval |= SSE2SLOW - } - - /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be - * used unless the OS has AVX support. */ - if (rval & AVX) != 0 { - if (c & 0x00000800) != 0 { - rval |= XOP - } - if (c & 0x00010000) != 0 { - rval |= FMA4 - } - } - - if vendorID() == Intel { - family, model := familyModel() - if family == 6 && (model == 9 || model == 13 || model == 14) { - /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and - * 6/14 (core1 "yonah") theoretically support sse2, but it's - * usually slower than mmx. */ - if (rval & SSE2) != 0 { - rval |= SSE2SLOW - } - if (rval & SSE3) != 0 { - rval |= SSE3SLOW - } - } - /* The Atom processor has SSSE3 support, which is useful in many cases, - * but sometimes the SSSE3 version is slower than the SSE2 equivalent - * on the Atom, but is generally faster on other processors supporting - * SSSE3. This flag allows for selectively disabling certain SSSE3 - * functions on the Atom. */ - if family == 6 && model == 28 { - rval |= ATOM - } - } - } - return Flags(rval) -} - -func valAsString(values ...uint32) []byte { - r := make([]byte, 4*len(values)) - for i, v := range values { - dst := r[i*4:] - dst[0] = byte(v & 0xff) - dst[1] = byte((v >> 8) & 0xff) - dst[2] = byte((v >> 16) & 0xff) - dst[3] = byte((v >> 24) & 0xff) - switch { - case dst[0] == 0: - return r[:i*4] - case dst[1] == 0: - return r[:i*4+1] - case dst[2] == 0: - return r[:i*4+2] - case dst[3] == 0: - return r[:i*4+3] - } - } - return r -} diff --git a/vendor/github.com/klauspost/cpuid/cpuid_386.s b/vendor/github.com/klauspost/cpuid/cpuid_386.s deleted file mode 100644 index 4d731711e4..0000000000 --- a/vendor/github.com/klauspost/cpuid/cpuid_386.s +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -// +build 386,!gccgo - -// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuid(SB), 7, $0 - XORL CX, CX - MOVL op+0(FP), AX - CPUID - MOVL AX, eax+4(FP) - MOVL BX, ebx+8(FP) - MOVL CX, ecx+12(FP) - MOVL DX, edx+16(FP) - RET - -// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuidex(SB), 7, $0 - MOVL op+0(FP), AX - MOVL op2+4(FP), CX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func xgetbv(index uint32) (eax, edx uint32) -TEXT ·asmXgetbv(SB), 7, $0 - MOVL index+0(FP), CX - BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV - MOVL AX, eax+4(FP) - MOVL DX, edx+8(FP) - RET - -// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) -TEXT ·asmRdtscpAsm(SB), 7, $0 - BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP - MOVL AX, eax+0(FP) - MOVL BX, ebx+4(FP) - MOVL CX, ecx+8(FP) - MOVL DX, edx+12(FP) - RET diff --git a/vendor/github.com/klauspost/cpuid/cpuid_amd64.s b/vendor/github.com/klauspost/cpuid/cpuid_amd64.s deleted file mode 100644 index 3c1d60e422..0000000000 --- a/vendor/github.com/klauspost/cpuid/cpuid_amd64.s +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -//+build amd64,!gccgo - -// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuid(SB), 7, $0 - XORQ CX, CX - MOVL op+0(FP), AX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -TEXT ·asmCpuidex(SB), 7, $0 - MOVL op+0(FP), AX - MOVL op2+4(FP), CX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func asmXgetbv(index uint32) (eax, edx uint32) -TEXT ·asmXgetbv(SB), 7, $0 - MOVL index+0(FP), CX - BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV - MOVL AX, eax+8(FP) - MOVL DX, edx+12(FP) - RET - -// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) -TEXT ·asmRdtscpAsm(SB), 7, $0 - BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP - MOVL AX, eax+0(FP) - MOVL BX, ebx+4(FP) - MOVL CX, ecx+8(FP) - MOVL DX, edx+12(FP) - RET diff --git a/vendor/github.com/klauspost/cpuid/detect_intel.go b/vendor/github.com/klauspost/cpuid/detect_intel.go deleted file mode 100644 index a5f04dd6d0..0000000000 --- a/vendor/github.com/klauspost/cpuid/detect_intel.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -// +build 386,!gccgo amd64,!gccgo - -package cpuid - -func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) -func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -func asmXgetbv(index uint32) (eax, edx uint32) -func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) - -func initCPU() { - cpuid = asmCpuid - cpuidex = asmCpuidex - xgetbv = asmXgetbv - rdtscpAsm = asmRdtscpAsm -} diff --git a/vendor/github.com/klauspost/cpuid/detect_ref.go b/vendor/github.com/klauspost/cpuid/detect_ref.go deleted file mode 100644 index 909c5d9a7a..0000000000 --- a/vendor/github.com/klauspost/cpuid/detect_ref.go +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. - -// +build !amd64,!386 gccgo - -package cpuid - -func initCPU() { - cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 - } - - cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 - } - - xgetbv = func(index uint32) (eax, edx uint32) { - return 0, 0 - } - - rdtscpAsm = func() (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 - } -} diff --git a/vendor/github.com/klauspost/cpuid/generate.go b/vendor/github.com/klauspost/cpuid/generate.go deleted file mode 100644 index c060b8165e..0000000000 --- a/vendor/github.com/klauspost/cpuid/generate.go +++ /dev/null @@ -1,3 +0,0 @@ -package cpuid - -//go:generate go run private-gen.go diff --git a/vendor/github.com/klauspost/crc32/.gitignore b/vendor/github.com/klauspost/crc32/.gitignore deleted file mode 100644 index daf913b1b3..0000000000 --- a/vendor/github.com/klauspost/crc32/.gitignore +++ /dev/null @@ -1,24 +0,0 @@ -# Compiled Object files, Static and Dynamic libs (Shared Objects) -*.o -*.a -*.so - -# Folders -_obj -_test - -# Architecture specific extensions/prefixes -*.[568vq] -[568vq].out - -*.cgo1.go -*.cgo2.c -_cgo_defun.c -_cgo_gotypes.go -_cgo_export.* - -_testmain.go - -*.exe -*.test -*.prof diff --git a/vendor/github.com/klauspost/crc32/.travis.yml b/vendor/github.com/klauspost/crc32/.travis.yml deleted file mode 100644 index de64ae491f..0000000000 --- a/vendor/github.com/klauspost/crc32/.travis.yml +++ /dev/null @@ -1,13 +0,0 @@ -language: go
-
-go:
- - 1.3
- - 1.4
- - 1.5
- - 1.6
- - 1.7
- - tip
-
-script:
- - go test -v .
- - go test -v -race .
diff --git a/vendor/github.com/klauspost/crc32/LICENSE b/vendor/github.com/klauspost/crc32/LICENSE deleted file mode 100644 index 4fd5963e39..0000000000 --- a/vendor/github.com/klauspost/crc32/LICENSE +++ /dev/null @@ -1,28 +0,0 @@ -Copyright (c) 2012 The Go Authors. All rights reserved. -Copyright (c) 2015 Klaus Post - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/klauspost/crc32/README.md b/vendor/github.com/klauspost/crc32/README.md deleted file mode 100644 index 029625d360..0000000000 --- a/vendor/github.com/klauspost/crc32/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# crc32 -CRC32 hash with x64 optimizations - -This package is a drop-in replacement for the standard library `hash/crc32` package, that features SSE 4.2 optimizations on x64 platforms, for a 10x speedup. - -[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32) - -# usage - -Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer. - -Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go. - -# changes -* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match. -* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable. - - -# performance - -For *Go 1.7* performance is equivalent to the standard library. So if you use this package for Go 1.7 you can switch back. - - -For IEEE tables (the most common), there is approximately a factor 10 speedup with "CLMUL" (Carryless multiplication) instruction: -``` -benchmark old ns/op new ns/op delta -BenchmarkCrc32KB 99955 10258 -89.74% - -benchmark old MB/s new MB/s speedup -BenchmarkCrc32KB 327.83 3194.20 9.74x -``` - -For other tables and "CLMUL" capable machines the performance is the same as the standard library. - -Here are some detailed benchmarks, comparing to go 1.5 standard library with and without assembler enabled. - -``` -Std: Standard Go 1.5 library -Crc: Indicates IEEE type CRC. -40B: Size of each slice encoded. -NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine). -Castagnoli: Castagnoli CRC type. - -BenchmarkStdCrc40B-4 10000000 158 ns/op 252.88 MB/s -BenchmarkCrc40BNoAsm-4 20000000 105 ns/op 377.38 MB/s (slice8) -BenchmarkCrc40B-4 20000000 105 ns/op 378.77 MB/s (slice8) - -BenchmarkStdCrc1KB-4 500000 3604 ns/op 284.10 MB/s -BenchmarkCrc1KBNoAsm-4 1000000 1463 ns/op 699.79 MB/s (slice8) -BenchmarkCrc1KB-4 3000000 396 ns/op 2583.69 MB/s (asm) - -BenchmarkStdCrc8KB-4 200000 11417 ns/op 717.48 MB/s (slice8) -BenchmarkCrc8KBNoAsm-4 200000 11317 ns/op 723.85 MB/s (slice8) -BenchmarkCrc8KB-4 500000 2919 ns/op 2805.73 MB/s (asm) - -BenchmarkStdCrc32KB-4 30000 45749 ns/op 716.24 MB/s (slice8) -BenchmarkCrc32KBNoAsm-4 30000 45109 ns/op 726.42 MB/s (slice8) -BenchmarkCrc32KB-4 100000 11497 ns/op 2850.09 MB/s (asm) - -BenchmarkStdNoAsmCastagnol40B-4 10000000 161 ns/op 246.94 MB/s -BenchmarkStdCastagnoli40B-4 50000000 28.4 ns/op 1410.69 MB/s (asm) -BenchmarkCastagnoli40BNoAsm-4 20000000 100 ns/op 398.01 MB/s (slice8) -BenchmarkCastagnoli40B-4 50000000 28.2 ns/op 1419.54 MB/s (asm) - -BenchmarkStdNoAsmCastagnoli1KB-4 500000 3622 ns/op 282.67 MB/s -BenchmarkStdCastagnoli1KB-4 10000000 144 ns/op 7099.78 MB/s (asm) -BenchmarkCastagnoli1KBNoAsm-4 1000000 1475 ns/op 694.14 MB/s (slice8) -BenchmarkCastagnoli1KB-4 10000000 146 ns/op 6993.35 MB/s (asm) - -BenchmarkStdNoAsmCastagnoli8KB-4 50000 28781 ns/op 284.63 MB/s -BenchmarkStdCastagnoli8KB-4 1000000 1029 ns/op 7957.89 MB/s (asm) -BenchmarkCastagnoli8KBNoAsm-4 200000 11410 ns/op 717.94 MB/s (slice8) -BenchmarkCastagnoli8KB-4 1000000 1000 ns/op 8188.71 MB/s (asm) - -BenchmarkStdNoAsmCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s -BenchmarkStdCastagnoli32KB-4 300000 4065 ns/op 8059.13 MB/s (asm) -BenchmarkCastagnoli32KBNoAsm-4 30000 45171 ns/op 725.41 MB/s (slice8) -BenchmarkCastagnoli32KB-4 500000 4077 ns/op 8035.89 MB/s (asm) -``` - -The IEEE assembler optimizations has been submitted and will be part of the Go 1.6 standard library. - -However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7. - -# license - -Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions. diff --git a/vendor/github.com/klauspost/crc32/crc32.go b/vendor/github.com/klauspost/crc32/crc32.go deleted file mode 100644 index 8aa91b17e9..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32.go +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32, -// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for -// information. -// -// Polynomials are represented in LSB-first form also known as reversed representation. -// -// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials -// for information. -package crc32 - -import ( - "hash" - "sync" -) - -// The size of a CRC-32 checksum in bytes. -const Size = 4 - -// Predefined polynomials. -const ( - // IEEE is by far and away the most common CRC-32 polynomial. - // Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ... - IEEE = 0xedb88320 - - // Castagnoli's polynomial, used in iSCSI. - // Has better error detection characteristics than IEEE. - // http://dx.doi.org/10.1109/26.231911 - Castagnoli = 0x82f63b78 - - // Koopman's polynomial. - // Also has better error detection characteristics than IEEE. - // http://dx.doi.org/10.1109/DSN.2002.1028931 - Koopman = 0xeb31d82e -) - -// Table is a 256-word table representing the polynomial for efficient processing. -type Table [256]uint32 - -// This file makes use of functions implemented in architecture-specific files. -// The interface that they implement is as follows: -// -// // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE -// // algorithm is available. -// archAvailableIEEE() bool -// -// // archInitIEEE initializes the architecture-specific CRC3-IEEE algorithm. -// // It can only be called if archAvailableIEEE() returns true. -// archInitIEEE() -// -// // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if -// // archInitIEEE() was previously called. -// archUpdateIEEE(crc uint32, p []byte) uint32 -// -// // archAvailableCastagnoli reports whether an architecture-specific -// // CRC32-C algorithm is available. -// archAvailableCastagnoli() bool -// -// // archInitCastagnoli initializes the architecture-specific CRC32-C -// // algorithm. It can only be called if archAvailableCastagnoli() returns -// // true. -// archInitCastagnoli() -// -// // archUpdateCastagnoli updates the given CRC32-C. It can only be called -// // if archInitCastagnoli() was previously called. -// archUpdateCastagnoli(crc uint32, p []byte) uint32 - -// castagnoliTable points to a lazily initialized Table for the Castagnoli -// polynomial. MakeTable will always return this value when asked to make a -// Castagnoli table so we can compare against it to find when the caller is -// using this polynomial. -var castagnoliTable *Table -var castagnoliTable8 *slicing8Table -var castagnoliArchImpl bool -var updateCastagnoli func(crc uint32, p []byte) uint32 -var castagnoliOnce sync.Once - -func castagnoliInit() { - castagnoliTable = simpleMakeTable(Castagnoli) - castagnoliArchImpl = archAvailableCastagnoli() - - if castagnoliArchImpl { - archInitCastagnoli() - updateCastagnoli = archUpdateCastagnoli - } else { - // Initialize the slicing-by-8 table. - castagnoliTable8 = slicingMakeTable(Castagnoli) - updateCastagnoli = func(crc uint32, p []byte) uint32 { - return slicingUpdate(crc, castagnoliTable8, p) - } - } -} - -// IEEETable is the table for the IEEE polynomial. -var IEEETable = simpleMakeTable(IEEE) - -// ieeeTable8 is the slicing8Table for IEEE -var ieeeTable8 *slicing8Table -var ieeeArchImpl bool -var updateIEEE func(crc uint32, p []byte) uint32 -var ieeeOnce sync.Once - -func ieeeInit() { - ieeeArchImpl = archAvailableIEEE() - - if ieeeArchImpl { - archInitIEEE() - updateIEEE = archUpdateIEEE - } else { - // Initialize the slicing-by-8 table. - ieeeTable8 = slicingMakeTable(IEEE) - updateIEEE = func(crc uint32, p []byte) uint32 { - return slicingUpdate(crc, ieeeTable8, p) - } - } -} - -// MakeTable returns a Table constructed from the specified polynomial. -// The contents of this Table must not be modified. -func MakeTable(poly uint32) *Table { - switch poly { - case IEEE: - ieeeOnce.Do(ieeeInit) - return IEEETable - case Castagnoli: - castagnoliOnce.Do(castagnoliInit) - return castagnoliTable - } - return simpleMakeTable(poly) -} - -// digest represents the partial evaluation of a checksum. -type digest struct { - crc uint32 - tab *Table -} - -// New creates a new hash.Hash32 computing the CRC-32 checksum -// using the polynomial represented by the Table. -// Its Sum method will lay the value out in big-endian byte order. -func New(tab *Table) hash.Hash32 { - if tab == IEEETable { - ieeeOnce.Do(ieeeInit) - } - return &digest{0, tab} -} - -// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum -// using the IEEE polynomial. -// Its Sum method will lay the value out in big-endian byte order. -func NewIEEE() hash.Hash32 { return New(IEEETable) } - -func (d *digest) Size() int { return Size } - -func (d *digest) BlockSize() int { return 1 } - -func (d *digest) Reset() { d.crc = 0 } - -// Update returns the result of adding the bytes in p to the crc. -func Update(crc uint32, tab *Table, p []byte) uint32 { - switch tab { - case castagnoliTable: - return updateCastagnoli(crc, p) - case IEEETable: - // Unfortunately, because IEEETable is exported, IEEE may be used without a - // call to MakeTable. We have to make sure it gets initialized in that case. - ieeeOnce.Do(ieeeInit) - return updateIEEE(crc, p) - default: - return simpleUpdate(crc, tab, p) - } -} - -func (d *digest) Write(p []byte) (n int, err error) { - switch d.tab { - case castagnoliTable: - d.crc = updateCastagnoli(d.crc, p) - case IEEETable: - // We only create digest objects through New() which takes care of - // initialization in this case. - d.crc = updateIEEE(d.crc, p) - default: - d.crc = simpleUpdate(d.crc, d.tab, p) - } - return len(p), nil -} - -func (d *digest) Sum32() uint32 { return d.crc } - -func (d *digest) Sum(in []byte) []byte { - s := d.Sum32() - return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s)) -} - -// Checksum returns the CRC-32 checksum of data -// using the polynomial represented by the Table. -func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) } - -// ChecksumIEEE returns the CRC-32 checksum of data -// using the IEEE polynomial. -func ChecksumIEEE(data []byte) uint32 { - ieeeOnce.Do(ieeeInit) - return updateIEEE(0, data) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.go b/vendor/github.com/klauspost/crc32/crc32_amd64.go deleted file mode 100644 index af2a0b844b..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64.go +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine,!gccgo - -// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a -// description of the interface that each architecture-specific file -// implements. - -package crc32 - -import "unsafe" - -// This file contains the code to call the SSE 4.2 version of the Castagnoli -// and IEEE CRC. - -// haveSSE41/haveSSE42/haveCLMUL are defined in crc_amd64.s and use -// CPUID to test for SSE 4.1, 4.2 and CLMUL support. -func haveSSE41() bool -func haveSSE42() bool -func haveCLMUL() bool - -// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32 -// instruction. -//go:noescape -func castagnoliSSE42(crc uint32, p []byte) uint32 - -// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32 -// instruction. -//go:noescape -func castagnoliSSE42Triple( - crcA, crcB, crcC uint32, - a, b, c []byte, - rounds uint32, -) (retA uint32, retB uint32, retC uint32) - -// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ -// instruction as well as SSE 4.1. -//go:noescape -func ieeeCLMUL(crc uint32, p []byte) uint32 - -var sse42 = haveSSE42() -var useFastIEEE = haveCLMUL() && haveSSE41() - -const castagnoliK1 = 168 -const castagnoliK2 = 1344 - -type sse42Table [4]Table - -var castagnoliSSE42TableK1 *sse42Table -var castagnoliSSE42TableK2 *sse42Table - -func archAvailableCastagnoli() bool { - return sse42 -} - -func archInitCastagnoli() { - if !sse42 { - panic("arch-specific Castagnoli not available") - } - castagnoliSSE42TableK1 = new(sse42Table) - castagnoliSSE42TableK2 = new(sse42Table) - // See description in updateCastagnoli. - // t[0][i] = CRC(i000, O) - // t[1][i] = CRC(0i00, O) - // t[2][i] = CRC(00i0, O) - // t[3][i] = CRC(000i, O) - // where O is a sequence of K zeros. - var tmp [castagnoliK2]byte - for b := 0; b < 4; b++ { - for i := 0; i < 256; i++ { - val := uint32(i) << uint32(b*8) - castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1]) - castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:]) - } - } -} - -// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the -// table given) with the given initial crc value. This corresponds to -// CRC(crc, O) in the description in updateCastagnoli. -func castagnoliShift(table *sse42Table, crc uint32) uint32 { - return table[3][crc>>24] ^ - table[2][(crc>>16)&0xFF] ^ - table[1][(crc>>8)&0xFF] ^ - table[0][crc&0xFF] -} - -func archUpdateCastagnoli(crc uint32, p []byte) uint32 { - if !sse42 { - panic("not available") - } - - // This method is inspired from the algorithm in Intel's white paper: - // "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" - // The same strategy of splitting the buffer in three is used but the - // combining calculation is different; the complete derivation is explained - // below. - // - // -- The basic idea -- - // - // The CRC32 instruction (available in SSE4.2) can process 8 bytes at a - // time. In recent Intel architectures the instruction takes 3 cycles; - // however the processor can pipeline up to three instructions if they - // don't depend on each other. - // - // Roughly this means that we can process three buffers in about the same - // time we can process one buffer. - // - // The idea is then to split the buffer in three, CRC the three pieces - // separately and then combine the results. - // - // Combining the results requires precomputed tables, so we must choose a - // fixed buffer length to optimize. The longer the length, the faster; but - // only buffers longer than this length will use the optimization. We choose - // two cutoffs and compute tables for both: - // - one around 512: 168*3=504 - // - one around 4KB: 1344*3=4032 - // - // -- The nitty gritty -- - // - // Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with - // initial non-inverted CRC I). This function has the following properties: - // (a) CRC(I, AB) = CRC(CRC(I, A), B) - // (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B) - // - // Say we want to compute CRC(I, ABC) where A, B, C are three sequences of - // K bytes each, where K is a fixed constant. Let O be the sequence of K zero - // bytes. - // - // CRC(I, ABC) = CRC(I, ABO xor C) - // = CRC(I, ABO) xor CRC(0, C) - // = CRC(CRC(I, AB), O) xor CRC(0, C) - // = CRC(CRC(I, AO xor B), O) xor CRC(0, C) - // = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C) - // = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C) - // - // The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B), - // and CRC(0, C) efficiently. We just need to find a way to quickly compute - // CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these - // values; since we can't have a 32-bit table, we break it up into four - // 8-bit tables: - // - // CRC(uvwx, O) = CRC(u000, O) xor - // CRC(0v00, O) xor - // CRC(00w0, O) xor - // CRC(000x, O) - // - // We can compute tables corresponding to the four terms for all 8-bit - // values. - - crc = ^crc - - // If a buffer is long enough to use the optimization, process the first few - // bytes to align the buffer to an 8 byte boundary (if necessary). - if len(p) >= castagnoliK1*3 { - delta := int(uintptr(unsafe.Pointer(&p[0])) & 7) - if delta != 0 { - delta = 8 - delta - crc = castagnoliSSE42(crc, p[:delta]) - p = p[delta:] - } - } - - // Process 3*K2 at a time. - for len(p) >= castagnoliK2*3 { - // Compute CRC(I, A), CRC(0, B), and CRC(0, C). - crcA, crcB, crcC := castagnoliSSE42Triple( - crc, 0, 0, - p, p[castagnoliK2:], p[castagnoliK2*2:], - castagnoliK2/24) - - // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B) - crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB - // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C) - crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC - p = p[castagnoliK2*3:] - } - - // Process 3*K1 at a time. - for len(p) >= castagnoliK1*3 { - // Compute CRC(I, A), CRC(0, B), and CRC(0, C). - crcA, crcB, crcC := castagnoliSSE42Triple( - crc, 0, 0, - p, p[castagnoliK1:], p[castagnoliK1*2:], - castagnoliK1/24) - - // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B) - crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB - // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C) - crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC - p = p[castagnoliK1*3:] - } - - // Use the simple implementation for what's left. - crc = castagnoliSSE42(crc, p) - return ^crc -} - -func archAvailableIEEE() bool { - return useFastIEEE -} - -var archIeeeTable8 *slicing8Table - -func archInitIEEE() { - if !useFastIEEE { - panic("not available") - } - // We still use slicing-by-8 for small buffers. - archIeeeTable8 = slicingMakeTable(IEEE) -} - -func archUpdateIEEE(crc uint32, p []byte) uint32 { - if !useFastIEEE { - panic("not available") - } - - if len(p) >= 64 { - left := len(p) & 15 - do := len(p) - left - crc = ^ieeeCLMUL(^crc, p[:do]) - p = p[do:] - } - if len(p) == 0 { - return crc - } - return slicingUpdate(crc, archIeeeTable8, p) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.s b/vendor/github.com/klauspost/crc32/crc32_amd64.s deleted file mode 100644 index e8a7941ce7..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64.s +++ /dev/null @@ -1,319 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build gc - -#define NOSPLIT 4 -#define RODATA 8 - -// castagnoliSSE42 updates the (non-inverted) crc with the given buffer. -// -// func castagnoliSSE42(crc uint32, p []byte) uint32 -TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 - MOVL crc+0(FP), AX // CRC value - MOVQ p+8(FP), SI // data pointer - MOVQ p_len+16(FP), CX // len(p) - - // If there are fewer than 8 bytes to process, skip alignment. - CMPQ CX, $8 - JL less_than_8 - - MOVQ SI, BX - ANDQ $7, BX - JZ aligned - - // Process the first few bytes to 8-byte align the input. - - // BX = 8 - BX. We need to process this many bytes to align. - SUBQ $1, BX - XORQ $7, BX - - BTQ $0, BX - JNC align_2 - - CRC32B (SI), AX - DECQ CX - INCQ SI - -align_2: - BTQ $1, BX - JNC align_4 - - // CRC32W (SI), AX - BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - - SUBQ $2, CX - ADDQ $2, SI - -align_4: - BTQ $2, BX - JNC aligned - - // CRC32L (SI), AX - BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - - SUBQ $4, CX - ADDQ $4, SI - -aligned: - // The input is now 8-byte aligned and we can process 8-byte chunks. - CMPQ CX, $8 - JL less_than_8 - - CRC32Q (SI), AX - ADDQ $8, SI - SUBQ $8, CX - JMP aligned - -less_than_8: - // We may have some bytes left over; process 4 bytes, then 2, then 1. - BTQ $2, CX - JNC less_than_4 - - // CRC32L (SI), AX - BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - ADDQ $4, SI - -less_than_4: - BTQ $1, CX - JNC less_than_2 - - // CRC32W (SI), AX - BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - ADDQ $2, SI - -less_than_2: - BTQ $0, CX - JNC done - - CRC32B (SI), AX - -done: - MOVL AX, ret+32(FP) - RET - -// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds) -// bytes from each buffer. -// -// func castagnoliSSE42Triple( -// crc1, crc2, crc3 uint32, -// a, b, c []byte, -// rounds uint32, -// ) (retA uint32, retB uint32, retC uint32) -TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0 - MOVL crcA+0(FP), AX - MOVL crcB+4(FP), CX - MOVL crcC+8(FP), DX - - MOVQ a+16(FP), R8 // data pointer - MOVQ b+40(FP), R9 // data pointer - MOVQ c+64(FP), R10 // data pointer - - MOVL rounds+88(FP), R11 - -loop: - CRC32Q (R8), AX - CRC32Q (R9), CX - CRC32Q (R10), DX - - CRC32Q 8(R8), AX - CRC32Q 8(R9), CX - CRC32Q 8(R10), DX - - CRC32Q 16(R8), AX - CRC32Q 16(R9), CX - CRC32Q 16(R10), DX - - ADDQ $24, R8 - ADDQ $24, R9 - ADDQ $24, R10 - - DECQ R11 - JNZ loop - - MOVL AX, retA+96(FP) - MOVL CX, retB+100(FP) - MOVL DX, retC+104(FP) - RET - -// func haveSSE42() bool -TEXT ·haveSSE42(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $20, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - -// func haveCLMUL() bool -TEXT ·haveCLMUL(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $1, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - -// func haveSSE41() bool -TEXT ·haveSSE41(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $19, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - -// CRC32 polynomial data -// -// These constants are lifted from the -// Linux kernel, since they avoid the costly -// PSHUFB 16 byte reversal proposed in the -// original Intel paper. -DATA r2r1kp<>+0(SB)/8, $0x154442bd4 -DATA r2r1kp<>+8(SB)/8, $0x1c6e41596 -DATA r4r3kp<>+0(SB)/8, $0x1751997d0 -DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e -DATA rupolykp<>+0(SB)/8, $0x1db710641 -DATA rupolykp<>+8(SB)/8, $0x1f7011641 -DATA r5kp<>+0(SB)/8, $0x163cd6124 - -GLOBL r2r1kp<>(SB), RODATA, $16 -GLOBL r4r3kp<>(SB), RODATA, $16 -GLOBL rupolykp<>(SB), RODATA, $16 -GLOBL r5kp<>(SB), RODATA, $8 - -// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf -// len(p) must be at least 64, and must be a multiple of 16. - -// func ieeeCLMUL(crc uint32, p []byte) uint32 -TEXT ·ieeeCLMUL(SB), NOSPLIT, $0 - MOVL crc+0(FP), X0 // Initial CRC value - MOVQ p+8(FP), SI // data pointer - MOVQ p_len+16(FP), CX // len(p) - - MOVOU (SI), X1 - MOVOU 16(SI), X2 - MOVOU 32(SI), X3 - MOVOU 48(SI), X4 - PXOR X0, X1 - ADDQ $64, SI // buf+=64 - SUBQ $64, CX // len-=64 - CMPQ CX, $64 // Less than 64 bytes left - JB remain64 - - MOVOA r2r1kp<>+0(SB), X0 - -loopback64: - MOVOA X1, X5 - MOVOA X2, X6 - MOVOA X3, X7 - MOVOA X4, X8 - - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0, X0, X2 - PCLMULQDQ $0, X0, X3 - PCLMULQDQ $0, X0, X4 - - // Load next early - MOVOU (SI), X11 - MOVOU 16(SI), X12 - MOVOU 32(SI), X13 - MOVOU 48(SI), X14 - - PCLMULQDQ $0x11, X0, X5 - PCLMULQDQ $0x11, X0, X6 - PCLMULQDQ $0x11, X0, X7 - PCLMULQDQ $0x11, X0, X8 - - PXOR X5, X1 - PXOR X6, X2 - PXOR X7, X3 - PXOR X8, X4 - - PXOR X11, X1 - PXOR X12, X2 - PXOR X13, X3 - PXOR X14, X4 - - ADDQ $0x40, DI - ADDQ $64, SI // buf+=64 - SUBQ $64, CX // len-=64 - CMPQ CX, $64 // Less than 64 bytes left? - JGE loopback64 - - // Fold result into a single register (X1) -remain64: - MOVOA r4r3kp<>+0(SB), X0 - - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X2, X1 - - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X3, X1 - - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X4, X1 - - // If there is less than 16 bytes left we are done - CMPQ CX, $16 - JB finish - - // Encode 16 bytes -remain16: - MOVOU (SI), X10 - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X10, X1 - SUBQ $16, CX - ADDQ $16, SI - CMPQ CX, $16 - JGE remain16 - -finish: - // Fold final result into 32 bits and return it - PCMPEQB X3, X3 - PCLMULQDQ $1, X1, X0 - PSRLDQ $8, X1 - PXOR X0, X1 - - MOVOA X1, X2 - MOVQ r5kp<>+0(SB), X0 - - // Creates 32 bit mask. Note that we don't care about upper half. - PSRLQ $32, X3 - - PSRLDQ $4, X2 - PAND X3, X1 - PCLMULQDQ $0, X0, X1 - PXOR X2, X1 - - MOVOA rupolykp<>+0(SB), X0 - - MOVOA X1, X2 - PAND X3, X1 - PCLMULQDQ $0x10, X0, X1 - PAND X3, X1 - PCLMULQDQ $0, X0, X1 - PXOR X2, X1 - - // PEXTRD $1, X1, AX (SSE 4.1) - BYTE $0x66; BYTE $0x0f; BYTE $0x3a - BYTE $0x16; BYTE $0xc8; BYTE $0x01 - MOVL AX, ret+32(FP) - - RET diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64p32.go b/vendor/github.com/klauspost/crc32/crc32_amd64p32.go deleted file mode 100644 index 3222b06a5a..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64p32.go +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine,!gccgo - -package crc32 - -// This file contains the code to call the SSE 4.2 version of the Castagnoli -// CRC. - -// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2 -// support. -func haveSSE42() bool - -// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32 -// instruction. -//go:noescape -func castagnoliSSE42(crc uint32, p []byte) uint32 - -var sse42 = haveSSE42() - -func archAvailableCastagnoli() bool { - return sse42 -} - -func archInitCastagnoli() { - if !sse42 { - panic("not available") - } - // No initialization necessary. -} - -func archUpdateCastagnoli(crc uint32, p []byte) uint32 { - if !sse42 { - panic("not available") - } - return castagnoliSSE42(crc, p) -} - -func archAvailableIEEE() bool { return false } -func archInitIEEE() { panic("not available") } -func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") } diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64p32.s b/vendor/github.com/klauspost/crc32/crc32_amd64p32.s deleted file mode 100644 index a578d685cc..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64p32.s +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build gc - -#define NOSPLIT 4 -#define RODATA 8 - -// func castagnoliSSE42(crc uint32, p []byte) uint32 -TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 - MOVL crc+0(FP), AX // CRC value - MOVL p+4(FP), SI // data pointer - MOVL p_len+8(FP), CX // len(p) - - NOTL AX - - // If there's less than 8 bytes to process, we do it byte-by-byte. - CMPQ CX, $8 - JL cleanup - - // Process individual bytes until the input is 8-byte aligned. -startup: - MOVQ SI, BX - ANDQ $7, BX - JZ aligned - - CRC32B (SI), AX - DECQ CX - INCQ SI - JMP startup - -aligned: - // The input is now 8-byte aligned and we can process 8-byte chunks. - CMPQ CX, $8 - JL cleanup - - CRC32Q (SI), AX - ADDQ $8, SI - SUBQ $8, CX - JMP aligned - -cleanup: - // We may have some bytes left over that we process one at a time. - CMPQ CX, $0 - JE done - - CRC32B (SI), AX - INCQ SI - DECQ CX - JMP cleanup - -done: - NOTL AX - MOVL AX, ret+16(FP) - RET - -// func haveSSE42() bool -TEXT ·haveSSE42(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $20, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - diff --git a/vendor/github.com/klauspost/crc32/crc32_generic.go b/vendor/github.com/klauspost/crc32/crc32_generic.go deleted file mode 100644 index abacbb663d..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_generic.go +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file contains CRC32 algorithms that are not specific to any architecture -// and don't use hardware acceleration. -// -// The simple (and slow) CRC32 implementation only uses a 256*4 bytes table. -// -// The slicing-by-8 algorithm is a faster implementation that uses a bigger -// table (8*256*4 bytes). - -package crc32 - -// simpleMakeTable allocates and constructs a Table for the specified -// polynomial. The table is suitable for use with the simple algorithm -// (simpleUpdate). -func simpleMakeTable(poly uint32) *Table { - t := new(Table) - simplePopulateTable(poly, t) - return t -} - -// simplePopulateTable constructs a Table for the specified polynomial, suitable -// for use with simpleUpdate. -func simplePopulateTable(poly uint32, t *Table) { - for i := 0; i < 256; i++ { - crc := uint32(i) - for j := 0; j < 8; j++ { - if crc&1 == 1 { - crc = (crc >> 1) ^ poly - } else { - crc >>= 1 - } - } - t[i] = crc - } -} - -// simpleUpdate uses the simple algorithm to update the CRC, given a table that -// was previously computed using simpleMakeTable. -func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 { - crc = ^crc - for _, v := range p { - crc = tab[byte(crc)^v] ^ (crc >> 8) - } - return ^crc -} - -// Use slicing-by-8 when payload >= this value. -const slicing8Cutoff = 16 - -// slicing8Table is array of 8 Tables, used by the slicing-by-8 algorithm. -type slicing8Table [8]Table - -// slicingMakeTable constructs a slicing8Table for the specified polynomial. The -// table is suitable for use with the slicing-by-8 algorithm (slicingUpdate). -func slicingMakeTable(poly uint32) *slicing8Table { - t := new(slicing8Table) - simplePopulateTable(poly, &t[0]) - for i := 0; i < 256; i++ { - crc := t[0][i] - for j := 1; j < 8; j++ { - crc = t[0][crc&0xFF] ^ (crc >> 8) - t[j][i] = crc - } - } - return t -} - -// slicingUpdate uses the slicing-by-8 algorithm to update the CRC, given a -// table that was previously computed using slicingMakeTable. -func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 { - if len(p) >= slicing8Cutoff { - crc = ^crc - for len(p) > 8 { - crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 - crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^ - tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^ - tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF] - p = p[8:] - } - crc = ^crc - } - if len(p) == 0 { - return crc - } - return simpleUpdate(crc, &tab[0], p) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_otherarch.go b/vendor/github.com/klauspost/crc32/crc32_otherarch.go deleted file mode 100644 index cc960764bc..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_otherarch.go +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !amd64,!amd64p32,!s390x - -package crc32 - -func archAvailableIEEE() bool { return false } -func archInitIEEE() { panic("not available") } -func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") } - -func archAvailableCastagnoli() bool { return false } -func archInitCastagnoli() { panic("not available") } -func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") } diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.go b/vendor/github.com/klauspost/crc32/crc32_s390x.go deleted file mode 100644 index ce96f03281..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_s390x.go +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build s390x - -package crc32 - -const ( - vxMinLen = 64 - vxAlignMask = 15 // align to 16 bytes -) - -// hasVectorFacility reports whether the machine has the z/Architecture -// vector facility installed and enabled. -func hasVectorFacility() bool - -var hasVX = hasVectorFacility() - -// vectorizedCastagnoli implements CRC32 using vector instructions. -// It is defined in crc32_s390x.s. -//go:noescape -func vectorizedCastagnoli(crc uint32, p []byte) uint32 - -// vectorizedIEEE implements CRC32 using vector instructions. -// It is defined in crc32_s390x.s. -//go:noescape -func vectorizedIEEE(crc uint32, p []byte) uint32 - -func archAvailableCastagnoli() bool { - return hasVX -} - -var archCastagnoliTable8 *slicing8Table - -func archInitCastagnoli() { - if !hasVX { - panic("not available") - } - // We still use slicing-by-8 for small buffers. - archCastagnoliTable8 = slicingMakeTable(Castagnoli) -} - -// archUpdateCastagnoli calculates the checksum of p using -// vectorizedCastagnoli. -func archUpdateCastagnoli(crc uint32, p []byte) uint32 { - if !hasVX { - panic("not available") - } - // Use vectorized function if data length is above threshold. - if len(p) >= vxMinLen { - aligned := len(p) & ^vxAlignMask - crc = vectorizedCastagnoli(crc, p[:aligned]) - p = p[aligned:] - } - if len(p) == 0 { - return crc - } - return slicingUpdate(crc, archCastagnoliTable8, p) -} - -func archAvailableIEEE() bool { - return hasVX -} - -var archIeeeTable8 *slicing8Table - -func archInitIEEE() { - if !hasVX { - panic("not available") - } - // We still use slicing-by-8 for small buffers. - archIeeeTable8 = slicingMakeTable(IEEE) -} - -// archUpdateIEEE calculates the checksum of p using vectorizedIEEE. -func archUpdateIEEE(crc uint32, p []byte) uint32 { - if !hasVX { - panic("not available") - } - // Use vectorized function if data length is above threshold. - if len(p) >= vxMinLen { - aligned := len(p) & ^vxAlignMask - crc = vectorizedIEEE(crc, p[:aligned]) - p = p[aligned:] - } - if len(p) == 0 { - return crc - } - return slicingUpdate(crc, archIeeeTable8, p) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.s b/vendor/github.com/klauspost/crc32/crc32_s390x.s deleted file mode 100644 index e980ca29d6..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_s390x.s +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build s390x - -#include "textflag.h" - -// Vector register range containing CRC-32 constants - -#define CONST_PERM_LE2BE V9 -#define CONST_R2R1 V10 -#define CONST_R4R3 V11 -#define CONST_R5 V12 -#define CONST_RU_POLY V13 -#define CONST_CRC_POLY V14 - -// The CRC-32 constant block contains reduction constants to fold and -// process particular chunks of the input data stream in parallel. -// -// Note that the constant definitions below are extended in order to compute -// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. -// The rightmost doubleword can be 0 to prevent contribution to the result or -// can be multiplied by 1 to perform an XOR without the need for a separate -// VECTOR EXCLUSIVE OR instruction. -// -// The polynomials used are bit-reflected: -// -// IEEE: P'(x) = 0x0edb88320 -// Castagnoli: P'(x) = 0x082f63b78 - -// IEEE polynomial constants -DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask -DATA ·crcleconskp+8(SB)/8, $0x0706050403020100 -DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2 -DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1 -DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4 -DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3 -DATA ·crcleconskp+48(SB)/8, $0x0000000000000000 -DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5 -DATA ·crcleconskp+64(SB)/8, $0x0000000000000000 -DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u' -DATA ·crcleconskp+80(SB)/8, $0x0000000000000000 -DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1 - -GLOBL ·crcleconskp(SB), RODATA, $144 - -// Castagonli Polynomial constants -DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask -DATA ·crccleconskp+8(SB)/8, $0x0706050403020100 -DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2 -DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1 -DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4 -DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3 -DATA ·crccleconskp+48(SB)/8, $0x0000000000000000 -DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5 -DATA ·crccleconskp+64(SB)/8, $0x0000000000000000 -DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u' -DATA ·crccleconskp+80(SB)/8, $0x0000000000000000 -DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1 - -GLOBL ·crccleconskp(SB), RODATA, $144 - -// func hasVectorFacility() bool -TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 - MOVD $x-24(SP), R1 - XC $24, 0(R1), 0(R1) // clear the storage - MOVD $2, R0 // R0 is the number of double words stored -1 - WORD $0xB2B01000 // STFLE 0(R1) - XOR R0, R0 // reset the value of R0 - MOVBZ z-8(SP), R1 - AND $0x40, R1 - BEQ novector - -vectorinstalled: - // check if the vector instruction has been enabled - VLEIB $0, $0xF, V16 - VLGVB $0, V16, R1 - CMPBNE R1, $0xF, novector - MOVB $1, ret+0(FP) // have vx - RET - -novector: - MOVB $0, ret+0(FP) // no vx - RET - -// The CRC-32 function(s) use these calling conventions: -// -// Parameters: -// -// R2: Initial CRC value, typically ~0; and final CRC (return) value. -// R3: Input buffer pointer, performance might be improved if the -// buffer is on a doubleword boundary. -// R4: Length of the buffer, must be 64 bytes or greater. -// -// Register usage: -// -// R5: CRC-32 constant pool base pointer. -// V0: Initial CRC value and intermediate constants and results. -// V1..V4: Data for CRC computation. -// V5..V8: Next data chunks that are fetched from the input buffer. -// -// V9..V14: CRC-32 constants. - -// func vectorizedIEEE(crc uint32, p []byte) uint32 -TEXT ·vectorizedIEEE(SB), NOSPLIT, $0 - MOVWZ crc+0(FP), R2 // R2 stores the CRC value - MOVD p+8(FP), R3 // data pointer - MOVD p_len+16(FP), R4 // len(p) - - MOVD $·crcleconskp(SB), R5 - BR vectorizedBody<>(SB) - -// func vectorizedCastagnoli(crc uint32, p []byte) uint32 -TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0 - MOVWZ crc+0(FP), R2 // R2 stores the CRC value - MOVD p+8(FP), R3 // data pointer - MOVD p_len+16(FP), R4 // len(p) - - // R5: crc-32 constant pool base pointer, constant is used to reduce crc - MOVD $·crccleconskp(SB), R5 - BR vectorizedBody<>(SB) - -TEXT vectorizedBody<>(SB), NOSPLIT, $0 - XOR $0xffffffff, R2 // NOTW R2 - VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY - - // Load the initial CRC value into the rightmost word of V0 - VZERO V0 - VLVGF $3, R2, V0 - - // Crash if the input size is less than 64-bytes. - CMP R4, $64 - BLT crash - - // Load a 64-byte data chunk and XOR with CRC - VLM 0(R3), V1, V4 // 64-bytes into V1..V4 - - // Reflect the data if the CRC operation is in the bit-reflected domain - VPERM V1, V1, CONST_PERM_LE2BE, V1 - VPERM V2, V2, CONST_PERM_LE2BE, V2 - VPERM V3, V3, CONST_PERM_LE2BE, V3 - VPERM V4, V4, CONST_PERM_LE2BE, V4 - - VX V0, V1, V1 // V1 ^= CRC - ADD $64, R3 // BUF = BUF + 64 - ADD $(-64), R4 - - // Check remaining buffer size and jump to proper folding method - CMP R4, $64 - BLT less_than_64bytes - -fold_64bytes_loop: - // Load the next 64-byte data chunk into V5 to V8 - VLM 0(R3), V5, V8 - VPERM V5, V5, CONST_PERM_LE2BE, V5 - VPERM V6, V6, CONST_PERM_LE2BE, V6 - VPERM V7, V7, CONST_PERM_LE2BE, V7 - VPERM V8, V8, CONST_PERM_LE2BE, V8 - - // Perform a GF(2) multiplication of the doublewords in V1 with - // the reduction constants in V0. The intermediate result is - // then folded (accumulated) with the next data chunk in V5 and - // stored in V1. Repeat this step for the register contents - // in V2, V3, and V4 respectively. - - VGFMAG CONST_R2R1, V1, V5, V1 - VGFMAG CONST_R2R1, V2, V6, V2 - VGFMAG CONST_R2R1, V3, V7, V3 - VGFMAG CONST_R2R1, V4, V8, V4 - - // Adjust buffer pointer and length for next loop - ADD $64, R3 // BUF = BUF + 64 - ADD $(-64), R4 // LEN = LEN - 64 - - CMP R4, $64 - BGE fold_64bytes_loop - -less_than_64bytes: - // Fold V1 to V4 into a single 128-bit value in V1 - VGFMAG CONST_R4R3, V1, V2, V1 - VGFMAG CONST_R4R3, V1, V3, V1 - VGFMAG CONST_R4R3, V1, V4, V1 - - // Check whether to continue with 64-bit folding - CMP R4, $16 - BLT final_fold - -fold_16bytes_loop: - VL 0(R3), V2 // Load next data chunk - VPERM V2, V2, CONST_PERM_LE2BE, V2 - - VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk - - // Adjust buffer pointer and size for folding next data chunk - ADD $16, R3 - ADD $-16, R4 - - // Process remaining data chunks - CMP R4, $16 - BGE fold_16bytes_loop - -final_fold: - VLEIB $7, $0x40, V9 - VSRLB V9, CONST_R4R3, V0 - VLEIG $0, $1, V0 - - VGFMG V0, V1, V1 - - VLEIB $7, $0x20, V9 // Shift by words - VSRLB V9, V1, V2 // Store remaining bits in V2 - VUPLLF V1, V1 // Split rightmost doubleword - VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2 - - // The input values to the Barret reduction are the degree-63 polynomial - // in V1 (R(x)), degree-32 generator polynomial, and the reduction - // constant u. The Barret reduction result is the CRC value of R(x) mod - // P(x). - // - // The Barret reduction algorithm is defined as: - // - // 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u - // 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) - // 3. C(x) = R(x) XOR T2(x) mod x^32 - // - // Note: To compensate the division by x^32, use the vector unpack - // instruction to move the leftmost word into the leftmost doubleword - // of the vector register. The rightmost doubleword is multiplied - // with zero to not contribute to the intermedate results. - - // T1(x) = floor( R(x) / x^32 ) GF2MUL u - VUPLLF V1, V2 - VGFMG CONST_RU_POLY, V2, V2 - - // Compute the GF(2) product of the CRC polynomial in VO with T1(x) in - // V2 and XOR the intermediate result, T2(x), with the value in V1. - // The final result is in the rightmost word of V2. - - VUPLLF V2, V2 - VGFMAG CONST_CRC_POLY, V2, V1, V2 - -done: - VLGVF $2, V2, R2 - XOR $0xffffffff, R2 // NOTW R2 - MOVWZ R2, ret + 32(FP) - RET - -crash: - MOVD $0, (R0) // input size is less than 64-bytes |