diff options
Diffstat (limited to 'vendor/github.com/klauspost/crc32')
-rw-r--r-- | vendor/github.com/klauspost/crc32/.gitignore | 24 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/.travis.yml | 13 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/LICENSE | 28 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/README.md | 87 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/crc32.go | 207 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/crc32_amd64.go | 230 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/crc32_amd64.s | 319 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/crc32_amd64p32.go | 43 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/crc32_amd64p32.s | 67 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/crc32_generic.go | 89 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/crc32_otherarch.go | 15 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/crc32_s390x.go | 91 | ||||
-rw-r--r-- | vendor/github.com/klauspost/crc32/crc32_s390x.s | 249 |
13 files changed, 0 insertions, 1462 deletions
diff --git a/vendor/github.com/klauspost/crc32/.gitignore b/vendor/github.com/klauspost/crc32/.gitignore deleted file mode 100644 index daf913b1b3..0000000000 --- a/vendor/github.com/klauspost/crc32/.gitignore +++ /dev/null @@ -1,24 +0,0 @@ -# Compiled Object files, Static and Dynamic libs (Shared Objects) -*.o -*.a -*.so - -# Folders -_obj -_test - -# Architecture specific extensions/prefixes -*.[568vq] -[568vq].out - -*.cgo1.go -*.cgo2.c -_cgo_defun.c -_cgo_gotypes.go -_cgo_export.* - -_testmain.go - -*.exe -*.test -*.prof diff --git a/vendor/github.com/klauspost/crc32/.travis.yml b/vendor/github.com/klauspost/crc32/.travis.yml deleted file mode 100644 index de64ae491f..0000000000 --- a/vendor/github.com/klauspost/crc32/.travis.yml +++ /dev/null @@ -1,13 +0,0 @@ -language: go
-
-go:
- - 1.3
- - 1.4
- - 1.5
- - 1.6
- - 1.7
- - tip
-
-script:
- - go test -v .
- - go test -v -race .
diff --git a/vendor/github.com/klauspost/crc32/LICENSE b/vendor/github.com/klauspost/crc32/LICENSE deleted file mode 100644 index 4fd5963e39..0000000000 --- a/vendor/github.com/klauspost/crc32/LICENSE +++ /dev/null @@ -1,28 +0,0 @@ -Copyright (c) 2012 The Go Authors. All rights reserved. -Copyright (c) 2015 Klaus Post - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/klauspost/crc32/README.md b/vendor/github.com/klauspost/crc32/README.md deleted file mode 100644 index 029625d360..0000000000 --- a/vendor/github.com/klauspost/crc32/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# crc32 -CRC32 hash with x64 optimizations - -This package is a drop-in replacement for the standard library `hash/crc32` package, that features SSE 4.2 optimizations on x64 platforms, for a 10x speedup. - -[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32) - -# usage - -Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer. - -Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go. - -# changes -* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match. -* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable. - - -# performance - -For *Go 1.7* performance is equivalent to the standard library. So if you use this package for Go 1.7 you can switch back. - - -For IEEE tables (the most common), there is approximately a factor 10 speedup with "CLMUL" (Carryless multiplication) instruction: -``` -benchmark old ns/op new ns/op delta -BenchmarkCrc32KB 99955 10258 -89.74% - -benchmark old MB/s new MB/s speedup -BenchmarkCrc32KB 327.83 3194.20 9.74x -``` - -For other tables and "CLMUL" capable machines the performance is the same as the standard library. - -Here are some detailed benchmarks, comparing to go 1.5 standard library with and without assembler enabled. - -``` -Std: Standard Go 1.5 library -Crc: Indicates IEEE type CRC. -40B: Size of each slice encoded. -NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine). -Castagnoli: Castagnoli CRC type. - -BenchmarkStdCrc40B-4 10000000 158 ns/op 252.88 MB/s -BenchmarkCrc40BNoAsm-4 20000000 105 ns/op 377.38 MB/s (slice8) -BenchmarkCrc40B-4 20000000 105 ns/op 378.77 MB/s (slice8) - -BenchmarkStdCrc1KB-4 500000 3604 ns/op 284.10 MB/s -BenchmarkCrc1KBNoAsm-4 1000000 1463 ns/op 699.79 MB/s (slice8) -BenchmarkCrc1KB-4 3000000 396 ns/op 2583.69 MB/s (asm) - -BenchmarkStdCrc8KB-4 200000 11417 ns/op 717.48 MB/s (slice8) -BenchmarkCrc8KBNoAsm-4 200000 11317 ns/op 723.85 MB/s (slice8) -BenchmarkCrc8KB-4 500000 2919 ns/op 2805.73 MB/s (asm) - -BenchmarkStdCrc32KB-4 30000 45749 ns/op 716.24 MB/s (slice8) -BenchmarkCrc32KBNoAsm-4 30000 45109 ns/op 726.42 MB/s (slice8) -BenchmarkCrc32KB-4 100000 11497 ns/op 2850.09 MB/s (asm) - -BenchmarkStdNoAsmCastagnol40B-4 10000000 161 ns/op 246.94 MB/s -BenchmarkStdCastagnoli40B-4 50000000 28.4 ns/op 1410.69 MB/s (asm) -BenchmarkCastagnoli40BNoAsm-4 20000000 100 ns/op 398.01 MB/s (slice8) -BenchmarkCastagnoli40B-4 50000000 28.2 ns/op 1419.54 MB/s (asm) - -BenchmarkStdNoAsmCastagnoli1KB-4 500000 3622 ns/op 282.67 MB/s -BenchmarkStdCastagnoli1KB-4 10000000 144 ns/op 7099.78 MB/s (asm) -BenchmarkCastagnoli1KBNoAsm-4 1000000 1475 ns/op 694.14 MB/s (slice8) -BenchmarkCastagnoli1KB-4 10000000 146 ns/op 6993.35 MB/s (asm) - -BenchmarkStdNoAsmCastagnoli8KB-4 50000 28781 ns/op 284.63 MB/s -BenchmarkStdCastagnoli8KB-4 1000000 1029 ns/op 7957.89 MB/s (asm) -BenchmarkCastagnoli8KBNoAsm-4 200000 11410 ns/op 717.94 MB/s (slice8) -BenchmarkCastagnoli8KB-4 1000000 1000 ns/op 8188.71 MB/s (asm) - -BenchmarkStdNoAsmCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s -BenchmarkStdCastagnoli32KB-4 300000 4065 ns/op 8059.13 MB/s (asm) -BenchmarkCastagnoli32KBNoAsm-4 30000 45171 ns/op 725.41 MB/s (slice8) -BenchmarkCastagnoli32KB-4 500000 4077 ns/op 8035.89 MB/s (asm) -``` - -The IEEE assembler optimizations has been submitted and will be part of the Go 1.6 standard library. - -However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7. - -# license - -Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions. diff --git a/vendor/github.com/klauspost/crc32/crc32.go b/vendor/github.com/klauspost/crc32/crc32.go deleted file mode 100644 index 8aa91b17e9..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32.go +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32, -// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for -// information. -// -// Polynomials are represented in LSB-first form also known as reversed representation. -// -// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials -// for information. -package crc32 - -import ( - "hash" - "sync" -) - -// The size of a CRC-32 checksum in bytes. -const Size = 4 - -// Predefined polynomials. -const ( - // IEEE is by far and away the most common CRC-32 polynomial. - // Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ... - IEEE = 0xedb88320 - - // Castagnoli's polynomial, used in iSCSI. - // Has better error detection characteristics than IEEE. - // http://dx.doi.org/10.1109/26.231911 - Castagnoli = 0x82f63b78 - - // Koopman's polynomial. - // Also has better error detection characteristics than IEEE. - // http://dx.doi.org/10.1109/DSN.2002.1028931 - Koopman = 0xeb31d82e -) - -// Table is a 256-word table representing the polynomial for efficient processing. -type Table [256]uint32 - -// This file makes use of functions implemented in architecture-specific files. -// The interface that they implement is as follows: -// -// // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE -// // algorithm is available. -// archAvailableIEEE() bool -// -// // archInitIEEE initializes the architecture-specific CRC3-IEEE algorithm. -// // It can only be called if archAvailableIEEE() returns true. -// archInitIEEE() -// -// // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if -// // archInitIEEE() was previously called. -// archUpdateIEEE(crc uint32, p []byte) uint32 -// -// // archAvailableCastagnoli reports whether an architecture-specific -// // CRC32-C algorithm is available. -// archAvailableCastagnoli() bool -// -// // archInitCastagnoli initializes the architecture-specific CRC32-C -// // algorithm. It can only be called if archAvailableCastagnoli() returns -// // true. -// archInitCastagnoli() -// -// // archUpdateCastagnoli updates the given CRC32-C. It can only be called -// // if archInitCastagnoli() was previously called. -// archUpdateCastagnoli(crc uint32, p []byte) uint32 - -// castagnoliTable points to a lazily initialized Table for the Castagnoli -// polynomial. MakeTable will always return this value when asked to make a -// Castagnoli table so we can compare against it to find when the caller is -// using this polynomial. -var castagnoliTable *Table -var castagnoliTable8 *slicing8Table -var castagnoliArchImpl bool -var updateCastagnoli func(crc uint32, p []byte) uint32 -var castagnoliOnce sync.Once - -func castagnoliInit() { - castagnoliTable = simpleMakeTable(Castagnoli) - castagnoliArchImpl = archAvailableCastagnoli() - - if castagnoliArchImpl { - archInitCastagnoli() - updateCastagnoli = archUpdateCastagnoli - } else { - // Initialize the slicing-by-8 table. - castagnoliTable8 = slicingMakeTable(Castagnoli) - updateCastagnoli = func(crc uint32, p []byte) uint32 { - return slicingUpdate(crc, castagnoliTable8, p) - } - } -} - -// IEEETable is the table for the IEEE polynomial. -var IEEETable = simpleMakeTable(IEEE) - -// ieeeTable8 is the slicing8Table for IEEE -var ieeeTable8 *slicing8Table -var ieeeArchImpl bool -var updateIEEE func(crc uint32, p []byte) uint32 -var ieeeOnce sync.Once - -func ieeeInit() { - ieeeArchImpl = archAvailableIEEE() - - if ieeeArchImpl { - archInitIEEE() - updateIEEE = archUpdateIEEE - } else { - // Initialize the slicing-by-8 table. - ieeeTable8 = slicingMakeTable(IEEE) - updateIEEE = func(crc uint32, p []byte) uint32 { - return slicingUpdate(crc, ieeeTable8, p) - } - } -} - -// MakeTable returns a Table constructed from the specified polynomial. -// The contents of this Table must not be modified. -func MakeTable(poly uint32) *Table { - switch poly { - case IEEE: - ieeeOnce.Do(ieeeInit) - return IEEETable - case Castagnoli: - castagnoliOnce.Do(castagnoliInit) - return castagnoliTable - } - return simpleMakeTable(poly) -} - -// digest represents the partial evaluation of a checksum. -type digest struct { - crc uint32 - tab *Table -} - -// New creates a new hash.Hash32 computing the CRC-32 checksum -// using the polynomial represented by the Table. -// Its Sum method will lay the value out in big-endian byte order. -func New(tab *Table) hash.Hash32 { - if tab == IEEETable { - ieeeOnce.Do(ieeeInit) - } - return &digest{0, tab} -} - -// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum -// using the IEEE polynomial. -// Its Sum method will lay the value out in big-endian byte order. -func NewIEEE() hash.Hash32 { return New(IEEETable) } - -func (d *digest) Size() int { return Size } - -func (d *digest) BlockSize() int { return 1 } - -func (d *digest) Reset() { d.crc = 0 } - -// Update returns the result of adding the bytes in p to the crc. -func Update(crc uint32, tab *Table, p []byte) uint32 { - switch tab { - case castagnoliTable: - return updateCastagnoli(crc, p) - case IEEETable: - // Unfortunately, because IEEETable is exported, IEEE may be used without a - // call to MakeTable. We have to make sure it gets initialized in that case. - ieeeOnce.Do(ieeeInit) - return updateIEEE(crc, p) - default: - return simpleUpdate(crc, tab, p) - } -} - -func (d *digest) Write(p []byte) (n int, err error) { - switch d.tab { - case castagnoliTable: - d.crc = updateCastagnoli(d.crc, p) - case IEEETable: - // We only create digest objects through New() which takes care of - // initialization in this case. - d.crc = updateIEEE(d.crc, p) - default: - d.crc = simpleUpdate(d.crc, d.tab, p) - } - return len(p), nil -} - -func (d *digest) Sum32() uint32 { return d.crc } - -func (d *digest) Sum(in []byte) []byte { - s := d.Sum32() - return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s)) -} - -// Checksum returns the CRC-32 checksum of data -// using the polynomial represented by the Table. -func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) } - -// ChecksumIEEE returns the CRC-32 checksum of data -// using the IEEE polynomial. -func ChecksumIEEE(data []byte) uint32 { - ieeeOnce.Do(ieeeInit) - return updateIEEE(0, data) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.go b/vendor/github.com/klauspost/crc32/crc32_amd64.go deleted file mode 100644 index af2a0b844b..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64.go +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine,!gccgo - -// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a -// description of the interface that each architecture-specific file -// implements. - -package crc32 - -import "unsafe" - -// This file contains the code to call the SSE 4.2 version of the Castagnoli -// and IEEE CRC. - -// haveSSE41/haveSSE42/haveCLMUL are defined in crc_amd64.s and use -// CPUID to test for SSE 4.1, 4.2 and CLMUL support. -func haveSSE41() bool -func haveSSE42() bool -func haveCLMUL() bool - -// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32 -// instruction. -//go:noescape -func castagnoliSSE42(crc uint32, p []byte) uint32 - -// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32 -// instruction. -//go:noescape -func castagnoliSSE42Triple( - crcA, crcB, crcC uint32, - a, b, c []byte, - rounds uint32, -) (retA uint32, retB uint32, retC uint32) - -// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ -// instruction as well as SSE 4.1. -//go:noescape -func ieeeCLMUL(crc uint32, p []byte) uint32 - -var sse42 = haveSSE42() -var useFastIEEE = haveCLMUL() && haveSSE41() - -const castagnoliK1 = 168 -const castagnoliK2 = 1344 - -type sse42Table [4]Table - -var castagnoliSSE42TableK1 *sse42Table -var castagnoliSSE42TableK2 *sse42Table - -func archAvailableCastagnoli() bool { - return sse42 -} - -func archInitCastagnoli() { - if !sse42 { - panic("arch-specific Castagnoli not available") - } - castagnoliSSE42TableK1 = new(sse42Table) - castagnoliSSE42TableK2 = new(sse42Table) - // See description in updateCastagnoli. - // t[0][i] = CRC(i000, O) - // t[1][i] = CRC(0i00, O) - // t[2][i] = CRC(00i0, O) - // t[3][i] = CRC(000i, O) - // where O is a sequence of K zeros. - var tmp [castagnoliK2]byte - for b := 0; b < 4; b++ { - for i := 0; i < 256; i++ { - val := uint32(i) << uint32(b*8) - castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1]) - castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:]) - } - } -} - -// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the -// table given) with the given initial crc value. This corresponds to -// CRC(crc, O) in the description in updateCastagnoli. -func castagnoliShift(table *sse42Table, crc uint32) uint32 { - return table[3][crc>>24] ^ - table[2][(crc>>16)&0xFF] ^ - table[1][(crc>>8)&0xFF] ^ - table[0][crc&0xFF] -} - -func archUpdateCastagnoli(crc uint32, p []byte) uint32 { - if !sse42 { - panic("not available") - } - - // This method is inspired from the algorithm in Intel's white paper: - // "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" - // The same strategy of splitting the buffer in three is used but the - // combining calculation is different; the complete derivation is explained - // below. - // - // -- The basic idea -- - // - // The CRC32 instruction (available in SSE4.2) can process 8 bytes at a - // time. In recent Intel architectures the instruction takes 3 cycles; - // however the processor can pipeline up to three instructions if they - // don't depend on each other. - // - // Roughly this means that we can process three buffers in about the same - // time we can process one buffer. - // - // The idea is then to split the buffer in three, CRC the three pieces - // separately and then combine the results. - // - // Combining the results requires precomputed tables, so we must choose a - // fixed buffer length to optimize. The longer the length, the faster; but - // only buffers longer than this length will use the optimization. We choose - // two cutoffs and compute tables for both: - // - one around 512: 168*3=504 - // - one around 4KB: 1344*3=4032 - // - // -- The nitty gritty -- - // - // Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with - // initial non-inverted CRC I). This function has the following properties: - // (a) CRC(I, AB) = CRC(CRC(I, A), B) - // (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B) - // - // Say we want to compute CRC(I, ABC) where A, B, C are three sequences of - // K bytes each, where K is a fixed constant. Let O be the sequence of K zero - // bytes. - // - // CRC(I, ABC) = CRC(I, ABO xor C) - // = CRC(I, ABO) xor CRC(0, C) - // = CRC(CRC(I, AB), O) xor CRC(0, C) - // = CRC(CRC(I, AO xor B), O) xor CRC(0, C) - // = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C) - // = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C) - // - // The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B), - // and CRC(0, C) efficiently. We just need to find a way to quickly compute - // CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these - // values; since we can't have a 32-bit table, we break it up into four - // 8-bit tables: - // - // CRC(uvwx, O) = CRC(u000, O) xor - // CRC(0v00, O) xor - // CRC(00w0, O) xor - // CRC(000x, O) - // - // We can compute tables corresponding to the four terms for all 8-bit - // values. - - crc = ^crc - - // If a buffer is long enough to use the optimization, process the first few - // bytes to align the buffer to an 8 byte boundary (if necessary). - if len(p) >= castagnoliK1*3 { - delta := int(uintptr(unsafe.Pointer(&p[0])) & 7) - if delta != 0 { - delta = 8 - delta - crc = castagnoliSSE42(crc, p[:delta]) - p = p[delta:] - } - } - - // Process 3*K2 at a time. - for len(p) >= castagnoliK2*3 { - // Compute CRC(I, A), CRC(0, B), and CRC(0, C). - crcA, crcB, crcC := castagnoliSSE42Triple( - crc, 0, 0, - p, p[castagnoliK2:], p[castagnoliK2*2:], - castagnoliK2/24) - - // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B) - crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB - // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C) - crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC - p = p[castagnoliK2*3:] - } - - // Process 3*K1 at a time. - for len(p) >= castagnoliK1*3 { - // Compute CRC(I, A), CRC(0, B), and CRC(0, C). - crcA, crcB, crcC := castagnoliSSE42Triple( - crc, 0, 0, - p, p[castagnoliK1:], p[castagnoliK1*2:], - castagnoliK1/24) - - // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B) - crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB - // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C) - crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC - p = p[castagnoliK1*3:] - } - - // Use the simple implementation for what's left. - crc = castagnoliSSE42(crc, p) - return ^crc -} - -func archAvailableIEEE() bool { - return useFastIEEE -} - -var archIeeeTable8 *slicing8Table - -func archInitIEEE() { - if !useFastIEEE { - panic("not available") - } - // We still use slicing-by-8 for small buffers. - archIeeeTable8 = slicingMakeTable(IEEE) -} - -func archUpdateIEEE(crc uint32, p []byte) uint32 { - if !useFastIEEE { - panic("not available") - } - - if len(p) >= 64 { - left := len(p) & 15 - do := len(p) - left - crc = ^ieeeCLMUL(^crc, p[:do]) - p = p[do:] - } - if len(p) == 0 { - return crc - } - return slicingUpdate(crc, archIeeeTable8, p) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.s b/vendor/github.com/klauspost/crc32/crc32_amd64.s deleted file mode 100644 index e8a7941ce7..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64.s +++ /dev/null @@ -1,319 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build gc - -#define NOSPLIT 4 -#define RODATA 8 - -// castagnoliSSE42 updates the (non-inverted) crc with the given buffer. -// -// func castagnoliSSE42(crc uint32, p []byte) uint32 -TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 - MOVL crc+0(FP), AX // CRC value - MOVQ p+8(FP), SI // data pointer - MOVQ p_len+16(FP), CX // len(p) - - // If there are fewer than 8 bytes to process, skip alignment. - CMPQ CX, $8 - JL less_than_8 - - MOVQ SI, BX - ANDQ $7, BX - JZ aligned - - // Process the first few bytes to 8-byte align the input. - - // BX = 8 - BX. We need to process this many bytes to align. - SUBQ $1, BX - XORQ $7, BX - - BTQ $0, BX - JNC align_2 - - CRC32B (SI), AX - DECQ CX - INCQ SI - -align_2: - BTQ $1, BX - JNC align_4 - - // CRC32W (SI), AX - BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - - SUBQ $2, CX - ADDQ $2, SI - -align_4: - BTQ $2, BX - JNC aligned - - // CRC32L (SI), AX - BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - - SUBQ $4, CX - ADDQ $4, SI - -aligned: - // The input is now 8-byte aligned and we can process 8-byte chunks. - CMPQ CX, $8 - JL less_than_8 - - CRC32Q (SI), AX - ADDQ $8, SI - SUBQ $8, CX - JMP aligned - -less_than_8: - // We may have some bytes left over; process 4 bytes, then 2, then 1. - BTQ $2, CX - JNC less_than_4 - - // CRC32L (SI), AX - BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - ADDQ $4, SI - -less_than_4: - BTQ $1, CX - JNC less_than_2 - - // CRC32W (SI), AX - BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 - ADDQ $2, SI - -less_than_2: - BTQ $0, CX - JNC done - - CRC32B (SI), AX - -done: - MOVL AX, ret+32(FP) - RET - -// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds) -// bytes from each buffer. -// -// func castagnoliSSE42Triple( -// crc1, crc2, crc3 uint32, -// a, b, c []byte, -// rounds uint32, -// ) (retA uint32, retB uint32, retC uint32) -TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0 - MOVL crcA+0(FP), AX - MOVL crcB+4(FP), CX - MOVL crcC+8(FP), DX - - MOVQ a+16(FP), R8 // data pointer - MOVQ b+40(FP), R9 // data pointer - MOVQ c+64(FP), R10 // data pointer - - MOVL rounds+88(FP), R11 - -loop: - CRC32Q (R8), AX - CRC32Q (R9), CX - CRC32Q (R10), DX - - CRC32Q 8(R8), AX - CRC32Q 8(R9), CX - CRC32Q 8(R10), DX - - CRC32Q 16(R8), AX - CRC32Q 16(R9), CX - CRC32Q 16(R10), DX - - ADDQ $24, R8 - ADDQ $24, R9 - ADDQ $24, R10 - - DECQ R11 - JNZ loop - - MOVL AX, retA+96(FP) - MOVL CX, retB+100(FP) - MOVL DX, retC+104(FP) - RET - -// func haveSSE42() bool -TEXT ·haveSSE42(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $20, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - -// func haveCLMUL() bool -TEXT ·haveCLMUL(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $1, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - -// func haveSSE41() bool -TEXT ·haveSSE41(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $19, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - -// CRC32 polynomial data -// -// These constants are lifted from the -// Linux kernel, since they avoid the costly -// PSHUFB 16 byte reversal proposed in the -// original Intel paper. -DATA r2r1kp<>+0(SB)/8, $0x154442bd4 -DATA r2r1kp<>+8(SB)/8, $0x1c6e41596 -DATA r4r3kp<>+0(SB)/8, $0x1751997d0 -DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e -DATA rupolykp<>+0(SB)/8, $0x1db710641 -DATA rupolykp<>+8(SB)/8, $0x1f7011641 -DATA r5kp<>+0(SB)/8, $0x163cd6124 - -GLOBL r2r1kp<>(SB), RODATA, $16 -GLOBL r4r3kp<>(SB), RODATA, $16 -GLOBL rupolykp<>(SB), RODATA, $16 -GLOBL r5kp<>(SB), RODATA, $8 - -// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf -// len(p) must be at least 64, and must be a multiple of 16. - -// func ieeeCLMUL(crc uint32, p []byte) uint32 -TEXT ·ieeeCLMUL(SB), NOSPLIT, $0 - MOVL crc+0(FP), X0 // Initial CRC value - MOVQ p+8(FP), SI // data pointer - MOVQ p_len+16(FP), CX // len(p) - - MOVOU (SI), X1 - MOVOU 16(SI), X2 - MOVOU 32(SI), X3 - MOVOU 48(SI), X4 - PXOR X0, X1 - ADDQ $64, SI // buf+=64 - SUBQ $64, CX // len-=64 - CMPQ CX, $64 // Less than 64 bytes left - JB remain64 - - MOVOA r2r1kp<>+0(SB), X0 - -loopback64: - MOVOA X1, X5 - MOVOA X2, X6 - MOVOA X3, X7 - MOVOA X4, X8 - - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0, X0, X2 - PCLMULQDQ $0, X0, X3 - PCLMULQDQ $0, X0, X4 - - // Load next early - MOVOU (SI), X11 - MOVOU 16(SI), X12 - MOVOU 32(SI), X13 - MOVOU 48(SI), X14 - - PCLMULQDQ $0x11, X0, X5 - PCLMULQDQ $0x11, X0, X6 - PCLMULQDQ $0x11, X0, X7 - PCLMULQDQ $0x11, X0, X8 - - PXOR X5, X1 - PXOR X6, X2 - PXOR X7, X3 - PXOR X8, X4 - - PXOR X11, X1 - PXOR X12, X2 - PXOR X13, X3 - PXOR X14, X4 - - ADDQ $0x40, DI - ADDQ $64, SI // buf+=64 - SUBQ $64, CX // len-=64 - CMPQ CX, $64 // Less than 64 bytes left? - JGE loopback64 - - // Fold result into a single register (X1) -remain64: - MOVOA r4r3kp<>+0(SB), X0 - - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X2, X1 - - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X3, X1 - - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X4, X1 - - // If there is less than 16 bytes left we are done - CMPQ CX, $16 - JB finish - - // Encode 16 bytes -remain16: - MOVOU (SI), X10 - MOVOA X1, X5 - PCLMULQDQ $0, X0, X1 - PCLMULQDQ $0x11, X0, X5 - PXOR X5, X1 - PXOR X10, X1 - SUBQ $16, CX - ADDQ $16, SI - CMPQ CX, $16 - JGE remain16 - -finish: - // Fold final result into 32 bits and return it - PCMPEQB X3, X3 - PCLMULQDQ $1, X1, X0 - PSRLDQ $8, X1 - PXOR X0, X1 - - MOVOA X1, X2 - MOVQ r5kp<>+0(SB), X0 - - // Creates 32 bit mask. Note that we don't care about upper half. - PSRLQ $32, X3 - - PSRLDQ $4, X2 - PAND X3, X1 - PCLMULQDQ $0, X0, X1 - PXOR X2, X1 - - MOVOA rupolykp<>+0(SB), X0 - - MOVOA X1, X2 - PAND X3, X1 - PCLMULQDQ $0x10, X0, X1 - PAND X3, X1 - PCLMULQDQ $0, X0, X1 - PXOR X2, X1 - - // PEXTRD $1, X1, AX (SSE 4.1) - BYTE $0x66; BYTE $0x0f; BYTE $0x3a - BYTE $0x16; BYTE $0xc8; BYTE $0x01 - MOVL AX, ret+32(FP) - - RET diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64p32.go b/vendor/github.com/klauspost/crc32/crc32_amd64p32.go deleted file mode 100644 index 3222b06a5a..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64p32.go +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine,!gccgo - -package crc32 - -// This file contains the code to call the SSE 4.2 version of the Castagnoli -// CRC. - -// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2 -// support. -func haveSSE42() bool - -// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32 -// instruction. -//go:noescape -func castagnoliSSE42(crc uint32, p []byte) uint32 - -var sse42 = haveSSE42() - -func archAvailableCastagnoli() bool { - return sse42 -} - -func archInitCastagnoli() { - if !sse42 { - panic("not available") - } - // No initialization necessary. -} - -func archUpdateCastagnoli(crc uint32, p []byte) uint32 { - if !sse42 { - panic("not available") - } - return castagnoliSSE42(crc, p) -} - -func archAvailableIEEE() bool { return false } -func archInitIEEE() { panic("not available") } -func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") } diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64p32.s b/vendor/github.com/klauspost/crc32/crc32_amd64p32.s deleted file mode 100644 index a578d685cc..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_amd64p32.s +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build gc - -#define NOSPLIT 4 -#define RODATA 8 - -// func castagnoliSSE42(crc uint32, p []byte) uint32 -TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 - MOVL crc+0(FP), AX // CRC value - MOVL p+4(FP), SI // data pointer - MOVL p_len+8(FP), CX // len(p) - - NOTL AX - - // If there's less than 8 bytes to process, we do it byte-by-byte. - CMPQ CX, $8 - JL cleanup - - // Process individual bytes until the input is 8-byte aligned. -startup: - MOVQ SI, BX - ANDQ $7, BX - JZ aligned - - CRC32B (SI), AX - DECQ CX - INCQ SI - JMP startup - -aligned: - // The input is now 8-byte aligned and we can process 8-byte chunks. - CMPQ CX, $8 - JL cleanup - - CRC32Q (SI), AX - ADDQ $8, SI - SUBQ $8, CX - JMP aligned - -cleanup: - // We may have some bytes left over that we process one at a time. - CMPQ CX, $0 - JE done - - CRC32B (SI), AX - INCQ SI - DECQ CX - JMP cleanup - -done: - NOTL AX - MOVL AX, ret+16(FP) - RET - -// func haveSSE42() bool -TEXT ·haveSSE42(SB), NOSPLIT, $0 - XORQ AX, AX - INCL AX - CPUID - SHRQ $20, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - diff --git a/vendor/github.com/klauspost/crc32/crc32_generic.go b/vendor/github.com/klauspost/crc32/crc32_generic.go deleted file mode 100644 index abacbb663d..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_generic.go +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file contains CRC32 algorithms that are not specific to any architecture -// and don't use hardware acceleration. -// -// The simple (and slow) CRC32 implementation only uses a 256*4 bytes table. -// -// The slicing-by-8 algorithm is a faster implementation that uses a bigger -// table (8*256*4 bytes). - -package crc32 - -// simpleMakeTable allocates and constructs a Table for the specified -// polynomial. The table is suitable for use with the simple algorithm -// (simpleUpdate). -func simpleMakeTable(poly uint32) *Table { - t := new(Table) - simplePopulateTable(poly, t) - return t -} - -// simplePopulateTable constructs a Table for the specified polynomial, suitable -// for use with simpleUpdate. -func simplePopulateTable(poly uint32, t *Table) { - for i := 0; i < 256; i++ { - crc := uint32(i) - for j := 0; j < 8; j++ { - if crc&1 == 1 { - crc = (crc >> 1) ^ poly - } else { - crc >>= 1 - } - } - t[i] = crc - } -} - -// simpleUpdate uses the simple algorithm to update the CRC, given a table that -// was previously computed using simpleMakeTable. -func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 { - crc = ^crc - for _, v := range p { - crc = tab[byte(crc)^v] ^ (crc >> 8) - } - return ^crc -} - -// Use slicing-by-8 when payload >= this value. -const slicing8Cutoff = 16 - -// slicing8Table is array of 8 Tables, used by the slicing-by-8 algorithm. -type slicing8Table [8]Table - -// slicingMakeTable constructs a slicing8Table for the specified polynomial. The -// table is suitable for use with the slicing-by-8 algorithm (slicingUpdate). -func slicingMakeTable(poly uint32) *slicing8Table { - t := new(slicing8Table) - simplePopulateTable(poly, &t[0]) - for i := 0; i < 256; i++ { - crc := t[0][i] - for j := 1; j < 8; j++ { - crc = t[0][crc&0xFF] ^ (crc >> 8) - t[j][i] = crc - } - } - return t -} - -// slicingUpdate uses the slicing-by-8 algorithm to update the CRC, given a -// table that was previously computed using slicingMakeTable. -func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 { - if len(p) >= slicing8Cutoff { - crc = ^crc - for len(p) > 8 { - crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 - crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^ - tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^ - tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF] - p = p[8:] - } - crc = ^crc - } - if len(p) == 0 { - return crc - } - return simpleUpdate(crc, &tab[0], p) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_otherarch.go b/vendor/github.com/klauspost/crc32/crc32_otherarch.go deleted file mode 100644 index cc960764bc..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_otherarch.go +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !amd64,!amd64p32,!s390x - -package crc32 - -func archAvailableIEEE() bool { return false } -func archInitIEEE() { panic("not available") } -func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") } - -func archAvailableCastagnoli() bool { return false } -func archInitCastagnoli() { panic("not available") } -func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") } diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.go b/vendor/github.com/klauspost/crc32/crc32_s390x.go deleted file mode 100644 index ce96f03281..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_s390x.go +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build s390x - -package crc32 - -const ( - vxMinLen = 64 - vxAlignMask = 15 // align to 16 bytes -) - -// hasVectorFacility reports whether the machine has the z/Architecture -// vector facility installed and enabled. -func hasVectorFacility() bool - -var hasVX = hasVectorFacility() - -// vectorizedCastagnoli implements CRC32 using vector instructions. -// It is defined in crc32_s390x.s. -//go:noescape -func vectorizedCastagnoli(crc uint32, p []byte) uint32 - -// vectorizedIEEE implements CRC32 using vector instructions. -// It is defined in crc32_s390x.s. -//go:noescape -func vectorizedIEEE(crc uint32, p []byte) uint32 - -func archAvailableCastagnoli() bool { - return hasVX -} - -var archCastagnoliTable8 *slicing8Table - -func archInitCastagnoli() { - if !hasVX { - panic("not available") - } - // We still use slicing-by-8 for small buffers. - archCastagnoliTable8 = slicingMakeTable(Castagnoli) -} - -// archUpdateCastagnoli calculates the checksum of p using -// vectorizedCastagnoli. -func archUpdateCastagnoli(crc uint32, p []byte) uint32 { - if !hasVX { - panic("not available") - } - // Use vectorized function if data length is above threshold. - if len(p) >= vxMinLen { - aligned := len(p) & ^vxAlignMask - crc = vectorizedCastagnoli(crc, p[:aligned]) - p = p[aligned:] - } - if len(p) == 0 { - return crc - } - return slicingUpdate(crc, archCastagnoliTable8, p) -} - -func archAvailableIEEE() bool { - return hasVX -} - -var archIeeeTable8 *slicing8Table - -func archInitIEEE() { - if !hasVX { - panic("not available") - } - // We still use slicing-by-8 for small buffers. - archIeeeTable8 = slicingMakeTable(IEEE) -} - -// archUpdateIEEE calculates the checksum of p using vectorizedIEEE. -func archUpdateIEEE(crc uint32, p []byte) uint32 { - if !hasVX { - panic("not available") - } - // Use vectorized function if data length is above threshold. - if len(p) >= vxMinLen { - aligned := len(p) & ^vxAlignMask - crc = vectorizedIEEE(crc, p[:aligned]) - p = p[aligned:] - } - if len(p) == 0 { - return crc - } - return slicingUpdate(crc, archIeeeTable8, p) -} diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.s b/vendor/github.com/klauspost/crc32/crc32_s390x.s deleted file mode 100644 index e980ca29d6..0000000000 --- a/vendor/github.com/klauspost/crc32/crc32_s390x.s +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build s390x - -#include "textflag.h" - -// Vector register range containing CRC-32 constants - -#define CONST_PERM_LE2BE V9 -#define CONST_R2R1 V10 -#define CONST_R4R3 V11 -#define CONST_R5 V12 -#define CONST_RU_POLY V13 -#define CONST_CRC_POLY V14 - -// The CRC-32 constant block contains reduction constants to fold and -// process particular chunks of the input data stream in parallel. -// -// Note that the constant definitions below are extended in order to compute -// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. -// The rightmost doubleword can be 0 to prevent contribution to the result or -// can be multiplied by 1 to perform an XOR without the need for a separate -// VECTOR EXCLUSIVE OR instruction. -// -// The polynomials used are bit-reflected: -// -// IEEE: P'(x) = 0x0edb88320 -// Castagnoli: P'(x) = 0x082f63b78 - -// IEEE polynomial constants -DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask -DATA ·crcleconskp+8(SB)/8, $0x0706050403020100 -DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2 -DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1 -DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4 -DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3 -DATA ·crcleconskp+48(SB)/8, $0x0000000000000000 -DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5 -DATA ·crcleconskp+64(SB)/8, $0x0000000000000000 -DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u' -DATA ·crcleconskp+80(SB)/8, $0x0000000000000000 -DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1 - -GLOBL ·crcleconskp(SB), RODATA, $144 - -// Castagonli Polynomial constants -DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask -DATA ·crccleconskp+8(SB)/8, $0x0706050403020100 -DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2 -DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1 -DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4 -DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3 -DATA ·crccleconskp+48(SB)/8, $0x0000000000000000 -DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5 -DATA ·crccleconskp+64(SB)/8, $0x0000000000000000 -DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u' -DATA ·crccleconskp+80(SB)/8, $0x0000000000000000 -DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1 - -GLOBL ·crccleconskp(SB), RODATA, $144 - -// func hasVectorFacility() bool -TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 - MOVD $x-24(SP), R1 - XC $24, 0(R1), 0(R1) // clear the storage - MOVD $2, R0 // R0 is the number of double words stored -1 - WORD $0xB2B01000 // STFLE 0(R1) - XOR R0, R0 // reset the value of R0 - MOVBZ z-8(SP), R1 - AND $0x40, R1 - BEQ novector - -vectorinstalled: - // check if the vector instruction has been enabled - VLEIB $0, $0xF, V16 - VLGVB $0, V16, R1 - CMPBNE R1, $0xF, novector - MOVB $1, ret+0(FP) // have vx - RET - -novector: - MOVB $0, ret+0(FP) // no vx - RET - -// The CRC-32 function(s) use these calling conventions: -// -// Parameters: -// -// R2: Initial CRC value, typically ~0; and final CRC (return) value. -// R3: Input buffer pointer, performance might be improved if the -// buffer is on a doubleword boundary. -// R4: Length of the buffer, must be 64 bytes or greater. -// -// Register usage: -// -// R5: CRC-32 constant pool base pointer. -// V0: Initial CRC value and intermediate constants and results. -// V1..V4: Data for CRC computation. -// V5..V8: Next data chunks that are fetched from the input buffer. -// -// V9..V14: CRC-32 constants. - -// func vectorizedIEEE(crc uint32, p []byte) uint32 -TEXT ·vectorizedIEEE(SB), NOSPLIT, $0 - MOVWZ crc+0(FP), R2 // R2 stores the CRC value - MOVD p+8(FP), R3 // data pointer - MOVD p_len+16(FP), R4 // len(p) - - MOVD $·crcleconskp(SB), R5 - BR vectorizedBody<>(SB) - -// func vectorizedCastagnoli(crc uint32, p []byte) uint32 -TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0 - MOVWZ crc+0(FP), R2 // R2 stores the CRC value - MOVD p+8(FP), R3 // data pointer - MOVD p_len+16(FP), R4 // len(p) - - // R5: crc-32 constant pool base pointer, constant is used to reduce crc - MOVD $·crccleconskp(SB), R5 - BR vectorizedBody<>(SB) - -TEXT vectorizedBody<>(SB), NOSPLIT, $0 - XOR $0xffffffff, R2 // NOTW R2 - VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY - - // Load the initial CRC value into the rightmost word of V0 - VZERO V0 - VLVGF $3, R2, V0 - - // Crash if the input size is less than 64-bytes. - CMP R4, $64 - BLT crash - - // Load a 64-byte data chunk and XOR with CRC - VLM 0(R3), V1, V4 // 64-bytes into V1..V4 - - // Reflect the data if the CRC operation is in the bit-reflected domain - VPERM V1, V1, CONST_PERM_LE2BE, V1 - VPERM V2, V2, CONST_PERM_LE2BE, V2 - VPERM V3, V3, CONST_PERM_LE2BE, V3 - VPERM V4, V4, CONST_PERM_LE2BE, V4 - - VX V0, V1, V1 // V1 ^= CRC - ADD $64, R3 // BUF = BUF + 64 - ADD $(-64), R4 - - // Check remaining buffer size and jump to proper folding method - CMP R4, $64 - BLT less_than_64bytes - -fold_64bytes_loop: - // Load the next 64-byte data chunk into V5 to V8 - VLM 0(R3), V5, V8 - VPERM V5, V5, CONST_PERM_LE2BE, V5 - VPERM V6, V6, CONST_PERM_LE2BE, V6 - VPERM V7, V7, CONST_PERM_LE2BE, V7 - VPERM V8, V8, CONST_PERM_LE2BE, V8 - - // Perform a GF(2) multiplication of the doublewords in V1 with - // the reduction constants in V0. The intermediate result is - // then folded (accumulated) with the next data chunk in V5 and - // stored in V1. Repeat this step for the register contents - // in V2, V3, and V4 respectively. - - VGFMAG CONST_R2R1, V1, V5, V1 - VGFMAG CONST_R2R1, V2, V6, V2 - VGFMAG CONST_R2R1, V3, V7, V3 - VGFMAG CONST_R2R1, V4, V8, V4 - - // Adjust buffer pointer and length for next loop - ADD $64, R3 // BUF = BUF + 64 - ADD $(-64), R4 // LEN = LEN - 64 - - CMP R4, $64 - BGE fold_64bytes_loop - -less_than_64bytes: - // Fold V1 to V4 into a single 128-bit value in V1 - VGFMAG CONST_R4R3, V1, V2, V1 - VGFMAG CONST_R4R3, V1, V3, V1 - VGFMAG CONST_R4R3, V1, V4, V1 - - // Check whether to continue with 64-bit folding - CMP R4, $16 - BLT final_fold - -fold_16bytes_loop: - VL 0(R3), V2 // Load next data chunk - VPERM V2, V2, CONST_PERM_LE2BE, V2 - - VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk - - // Adjust buffer pointer and size for folding next data chunk - ADD $16, R3 - ADD $-16, R4 - - // Process remaining data chunks - CMP R4, $16 - BGE fold_16bytes_loop - -final_fold: - VLEIB $7, $0x40, V9 - VSRLB V9, CONST_R4R3, V0 - VLEIG $0, $1, V0 - - VGFMG V0, V1, V1 - - VLEIB $7, $0x20, V9 // Shift by words - VSRLB V9, V1, V2 // Store remaining bits in V2 - VUPLLF V1, V1 // Split rightmost doubleword - VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2 - - // The input values to the Barret reduction are the degree-63 polynomial - // in V1 (R(x)), degree-32 generator polynomial, and the reduction - // constant u. The Barret reduction result is the CRC value of R(x) mod - // P(x). - // - // The Barret reduction algorithm is defined as: - // - // 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u - // 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) - // 3. C(x) = R(x) XOR T2(x) mod x^32 - // - // Note: To compensate the division by x^32, use the vector unpack - // instruction to move the leftmost word into the leftmost doubleword - // of the vector register. The rightmost doubleword is multiplied - // with zero to not contribute to the intermedate results. - - // T1(x) = floor( R(x) / x^32 ) GF2MUL u - VUPLLF V1, V2 - VGFMG CONST_RU_POLY, V2, V2 - - // Compute the GF(2) product of the CRC polynomial in VO with T1(x) in - // V2 and XOR the intermediate result, T2(x), with the value in V1. - // The final result is in the rightmost word of V2. - - VUPLLF V2, V2 - VGFMAG CONST_CRC_POLY, V2, V1, V2 - -done: - VLGVF $2, V2, R2 - XOR $0xffffffff, R2 // NOTW R2 - MOVWZ R2, ret + 32(FP) - RET - -crash: - MOVD $0, (R0) // input size is less than 64-bytes |