source.dussan.org Git - gitea.git/commitdiff
Move modules/gzip to gitea.com/macaron/gzip (#9058)
author Lunny Xiao <xiaolunwen@gmail.com>
Mon, 18 Nov 2019 05:18:33 +0000 (13:18 +0800)
committer GitHub <noreply@github.com>
Mon, 18 Nov 2019 05:18:33 +0000 (13:18 +0800)
* Move modules/gzip to gitea.com/macaron/gzip

* Fix vendor
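
For downstream code the only change is the import path: code.gitea.io/gitea/modules/gzip becomes gitea.com/macaron/gzip. A minimal sketch of registering the relocated middleware in a macaron application (the handler and compression level below are illustrative, not taken from this commit):

package main

import (
	"gitea.com/macaron/gzip"
	"gitea.com/macaron/macaron"
)

func main() {
	m := macaron.New()
	// Register gzip before any handlers that write responses.
	m.Use(gzip.Middleware(gzip.Options{CompressionLevel: 6}))
	m.Get("/", func() string { return "hello" })
	m.Run()
}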

55 files changed:
go.mod
go.sum
integrations/lfs_getobject_test.go
modules/gzip/gzip.go [deleted file]
modules/gzip/gzip_test.go [deleted file]
routers/routes/routes.go
vendor/gitea.com/macaron/gzip/go.mod [new file with mode: 0644]
vendor/gitea.com/macaron/gzip/go.sum [new file with mode: 0644]
vendor/gitea.com/macaron/gzip/gzip.go [new file with mode: 0644]
vendor/github.com/klauspost/compress/LICENSE
vendor/github.com/klauspost/compress/flate/copy.go [deleted file]
vendor/github.com/klauspost/compress/flate/crc32_amd64.go [deleted file]
vendor/github.com/klauspost/compress/flate/crc32_amd64.s [deleted file]
vendor/github.com/klauspost/compress/flate/crc32_noasm.go [deleted file]
vendor/github.com/klauspost/compress/flate/deflate.go
vendor/github.com/klauspost/compress/flate/fast_encoder.go [new file with mode: 0644]
vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
vendor/github.com/klauspost/compress/flate/huffman_code.go
vendor/github.com/klauspost/compress/flate/inflate.go
vendor/github.com/klauspost/compress/flate/level1.go [new file with mode: 0644]
vendor/github.com/klauspost/compress/flate/level2.go [new file with mode: 0644]
vendor/github.com/klauspost/compress/flate/level3.go [new file with mode: 0644]
vendor/github.com/klauspost/compress/flate/level4.go [new file with mode: 0644]
vendor/github.com/klauspost/compress/flate/level5.go [new file with mode: 0644]
vendor/github.com/klauspost/compress/flate/level6.go [new file with mode: 0644]
vendor/github.com/klauspost/compress/flate/reverse_bits.go [deleted file]
vendor/github.com/klauspost/compress/flate/snappy.go [deleted file]
vendor/github.com/klauspost/compress/flate/stateless.go [new file with mode: 0644]
vendor/github.com/klauspost/compress/flate/token.go
vendor/github.com/klauspost/compress/gzip/gunzip.go
vendor/github.com/klauspost/compress/gzip/gzip.go
vendor/github.com/klauspost/cpuid/.gitignore [deleted file]
vendor/github.com/klauspost/cpuid/.travis.yml [deleted file]
vendor/github.com/klauspost/cpuid/LICENSE [deleted file]
vendor/github.com/klauspost/cpuid/README.md [deleted file]
vendor/github.com/klauspost/cpuid/cpuid.go [deleted file]
vendor/github.com/klauspost/cpuid/cpuid_386.s [deleted file]
vendor/github.com/klauspost/cpuid/cpuid_amd64.s [deleted file]
vendor/github.com/klauspost/cpuid/detect_intel.go [deleted file]
vendor/github.com/klauspost/cpuid/detect_ref.go [deleted file]
vendor/github.com/klauspost/cpuid/generate.go [deleted file]
vendor/github.com/klauspost/crc32/.gitignore [deleted file]
vendor/github.com/klauspost/crc32/.travis.yml [deleted file]
vendor/github.com/klauspost/crc32/LICENSE [deleted file]
vendor/github.com/klauspost/crc32/README.md [deleted file]
vendor/github.com/klauspost/crc32/crc32.go [deleted file]
vendor/github.com/klauspost/crc32/crc32_amd64.go [deleted file]
vendor/github.com/klauspost/crc32/crc32_amd64.s [deleted file]
vendor/github.com/klauspost/crc32/crc32_amd64p32.go [deleted file]
vendor/github.com/klauspost/crc32/crc32_amd64p32.s [deleted file]
vendor/github.com/klauspost/crc32/crc32_generic.go [deleted file]
vendor/github.com/klauspost/crc32/crc32_otherarch.go [deleted file]
vendor/github.com/klauspost/crc32/crc32_s390x.go [deleted file]
vendor/github.com/klauspost/crc32/crc32_s390x.s [deleted file]
vendor/modules.txt

diff --git a/go.mod b/go.mod
index 2368a25bc13384a1310fd53a43d70cc64f1fb81d..64cc079b3570daa97db36544f91eda15e91669d5 100644 (file)
--- a/go.mod
+++ b/go.mod
@@ -9,6 +9,7 @@ require (
        gitea.com/macaron/captcha v0.0.0-20190822015246-daa973478bae
        gitea.com/macaron/cors v0.0.0-20190821152825-7dcef4a17175
        gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439
+       gitea.com/macaron/gzip v0.0.0-20191118033930-0c4c5566a0e5
        gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223
        gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a
        gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb
@@ -55,9 +56,7 @@ require (
        github.com/joho/godotenv v1.3.0 // indirect
        github.com/kballard/go-shellquote v0.0.0-20170619183022-cd60e84ee657
        github.com/keybase/go-crypto v0.0.0-20170605145657-00ac4db533f6
-       github.com/klauspost/compress v0.0.0-20161025140425-8df558b6cb6f
-       github.com/klauspost/cpuid v0.0.0-20160302075316-09cded8978dc // indirect
-       github.com/klauspost/crc32 v0.0.0-20161016154125-cb6bfca970f6 // indirect
+       github.com/klauspost/compress v1.9.2
        github.com/lafriks/xormstore v1.3.2
        github.com/lib/pq v1.2.0
        github.com/lunny/dingtalk_webhook v0.0.0-20171025031554-e3534c89ef96
diff --git a/go.sum b/go.sum
index 83b2152114d87b65e5d13ce630bc98d027e5979a..24b7f6f92ec4a6198bf0fb1c775c9f0c2ba03846 100644 (file)
--- a/go.sum
+++ b/go.sum
@@ -20,6 +20,8 @@ gitea.com/macaron/cors v0.0.0-20190821152825-7dcef4a17175 h1:ikzdAGB6SsUGByW5wKl
 gitea.com/macaron/cors v0.0.0-20190821152825-7dcef4a17175/go.mod h1:rtOK4J20kpMD9XcNsnO5YA843YSTe/MUMbDj/TJ/Q7A=
 gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439 h1:88c34YM29a1GlWLrLBaG/GTT2htDdJz1u3n9+lmPolg=
 gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439/go.mod h1:IsQPHx73HnnqFBYiVHjg87q4XBZyGXXu77xANukvZuk=
+gitea.com/macaron/gzip v0.0.0-20191118033930-0c4c5566a0e5 h1:G/a7r0r2jEelSynBlv1+PAEZQKfsdRHQUMb1PlNvemM=
+gitea.com/macaron/gzip v0.0.0-20191118033930-0c4c5566a0e5/go.mod h1:jGHtoovArcQj+sw7NJxyPgjuRxOSG9a/oFu3VkLRTKQ=
 gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223 h1:iZWwQif/LHMjBgfY/ua8CFVa4XMDfbbs7EZ0Q1dYguU=
 gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223/go.mod h1:+qsc10s4hBsHKU/9luGGumFh4m5FFVc7uih+8/mM1NY=
 gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591/go.mod h1:h6E4kLao1Yko6DOU6QDnQPcuoNzvbZqzj2mtPcEn1aM=
@@ -334,12 +336,8 @@ github.com/keybase/go-crypto v0.0.0-20170605145657-00ac4db533f6/go.mod h1:ghbZsc
 github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=
 github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
-github.com/klauspost/compress v0.0.0-20161025140425-8df558b6cb6f h1:tCnZKEmDovgV4jmsclh6CuKk9AMzTzyVWfejgkgccVg=
-github.com/klauspost/compress v0.0.0-20161025140425-8df558b6cb6f/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
-github.com/klauspost/cpuid v0.0.0-20160302075316-09cded8978dc h1:WW8B7p7QBnFlqRVv/k6ro/S8Z7tCnYjJHcQNScx9YVs=
-github.com/klauspost/cpuid v0.0.0-20160302075316-09cded8978dc/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
-github.com/klauspost/crc32 v0.0.0-20161016154125-cb6bfca970f6 h1:KAZ1BW2TCmT6PRihDPpocIy1QTtsAsrx6TneU/4+CMg=
-github.com/klauspost/crc32 v0.0.0-20161016154125-cb6bfca970f6/go.mod h1:+ZoRqAPRLkC4NPOvfYeR5KNOrY6TD+/sAC3HXPZgDYg=
+github.com/klauspost/compress v1.9.2 h1:LfVyl+ZlLlLDeQ/d2AqfGIIH4qEDu0Ed2S5GyhCWIWY=
+github.com/klauspost/compress v1.9.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
diff --git a/integrations/lfs_getobject_test.go b/integrations/lfs_getobject_test.go
index 373fffa4453bdb8ac70c989a568a52090c57e49e..e9c1d1178fcf452d4ff729c51d1a18002c429622 100644 (file)
@@ -15,10 +15,10 @@ import (
        "testing"
 
        "code.gitea.io/gitea/models"
-       "code.gitea.io/gitea/modules/gzip"
        "code.gitea.io/gitea/modules/lfs"
        "code.gitea.io/gitea/modules/setting"
 
+       "gitea.com/macaron/gzip"
        gzipp "github.com/klauspost/compress/gzip"
        "github.com/stretchr/testify/assert"
 )
diff --git a/modules/gzip/gzip.go b/modules/gzip/gzip.go
deleted file mode 100644 (file)
index 9573d16..0000000
+++ /dev/null
@@ -1,358 +0,0 @@
-// Copyright 2019 The Gitea Authors. All rights reserved.
-// Use of this source code is governed by a MIT-style
-// license that can be found in the LICENSE file.
-
-package gzip
-
-import (
-       "bufio"
-       "fmt"
-       "io"
-       "net"
-       "net/http"
-       "regexp"
-       "strconv"
-       "strings"
-       "sync"
-
-       "gitea.com/macaron/macaron"
-       "github.com/klauspost/compress/gzip"
-)
-
-const (
-       acceptEncodingHeader  = "Accept-Encoding"
-       contentEncodingHeader = "Content-Encoding"
-       contentLengthHeader   = "Content-Length"
-       contentTypeHeader     = "Content-Type"
-       rangeHeader           = "Range"
-       varyHeader            = "Vary"
-)
-
-const (
-       // MinSize is the minimum size of content we will compress
-       MinSize = 1400
-)
-
-// noopClosers are io.Writers with a shim to prevent early closure
-type noopCloser struct {
-       io.Writer
-}
-
-func (noopCloser) Close() error { return nil }
-
-// WriterPool is a gzip writer pool to reduce workload on creation of
-// gzip writers
-type WriterPool struct {
-       pool             sync.Pool
-       compressionLevel int
-}
-
-// NewWriterPool creates a new pool
-func NewWriterPool(compressionLevel int) *WriterPool {
-       return &WriterPool{pool: sync.Pool{
-               // New will return nil, we'll manage the creation of new
-               // writers in the middleware
-               New: func() interface{} { return nil },
-       },
-               compressionLevel: compressionLevel}
-}
-
-// Get a writer from the pool - or create one if not available
-func (wp *WriterPool) Get(rw macaron.ResponseWriter) *gzip.Writer {
-       ret := wp.pool.Get()
-       if ret == nil {
-               ret, _ = gzip.NewWriterLevel(rw, wp.compressionLevel)
-       } else {
-               ret.(*gzip.Writer).Reset(rw)
-       }
-       return ret.(*gzip.Writer)
-}
-
-// Put returns a writer to the pool
-func (wp *WriterPool) Put(w *gzip.Writer) {
-       wp.pool.Put(w)
-}
-
-var writerPool WriterPool
-
-// Options represents the configuration for the gzip middleware
-type Options struct {
-       CompressionLevel int
-}
-
-func validateCompressionLevel(level int) bool {
-       return level == gzip.DefaultCompression ||
-               level == gzip.ConstantCompression ||
-               (level >= gzip.BestSpeed && level <= gzip.BestCompression)
-}
-
-func validate(options []Options) Options {
-       // Default to level 4 compression (Best results seem to be between 4 and 6)
-       opt := Options{CompressionLevel: 4}
-       if len(options) > 0 {
-               opt = options[0]
-       }
-       if !validateCompressionLevel(opt.CompressionLevel) {
-               opt.CompressionLevel = 4
-       }
-       return opt
-}
-
-// Middleware creates a macaron.Handler to proxy the response
-func Middleware(options ...Options) macaron.Handler {
-       opt := validate(options)
-       writerPool = *NewWriterPool(opt.CompressionLevel)
-       regex := regexp.MustCompile(`bytes=(\d+)\-.*`)
-
-       return func(ctx *macaron.Context) {
-               // If the client won't accept gzip or x-gzip don't compress
-               if !strings.Contains(ctx.Req.Header.Get(acceptEncodingHeader), "gzip") &&
-                       !strings.Contains(ctx.Req.Header.Get(acceptEncodingHeader), "x-gzip") {
-                       return
-               }
-
-               // If the client is asking for a specific range of bytes - don't compress
-               if rangeHdr := ctx.Req.Header.Get(rangeHeader); rangeHdr != "" {
-
-                       match := regex.FindStringSubmatch(rangeHdr)
-                       if len(match) > 1 {
-                               return
-                       }
-               }
-
-               // OK we should proxy the response writer
-               // We are still not necessarily going to compress...
-               proxyWriter := &ProxyResponseWriter{
-                       internal: ctx.Resp,
-               }
-               defer proxyWriter.Close()
-
-               ctx.Resp = proxyWriter
-               ctx.MapTo(proxyWriter, (*http.ResponseWriter)(nil))
-
-               // Check if render middleware has been registered,
-               // if yes, we need to modify ResponseWriter for it as well.
-               if _, ok := ctx.Render.(*macaron.DummyRender); !ok {
-                       ctx.Render.SetResponseWriter(proxyWriter)
-               }
-
-               ctx.Next()
-               ctx.Resp = proxyWriter.internal
-       }
-}
-
-// ProxyResponseWriter is a wrapped macaron ResponseWriter that may compress its contents
-type ProxyResponseWriter struct {
-       writer   io.WriteCloser
-       internal macaron.ResponseWriter
-       stopped  bool
-
-       code int
-       buf  []byte
-}
-
-// Header returns the header map
-func (proxy *ProxyResponseWriter) Header() http.Header {
-       return proxy.internal.Header()
-}
-
-// Status returns the status code of the response or 0 if the response has not been written.
-func (proxy *ProxyResponseWriter) Status() int {
-       if proxy.code != 0 {
-               return proxy.code
-       }
-       return proxy.internal.Status()
-}
-
-// Written returns whether or not the ResponseWriter has been written.
-func (proxy *ProxyResponseWriter) Written() bool {
-       if proxy.code != 0 {
-               return true
-       }
-       return proxy.internal.Written()
-}
-
-// Size returns the size of the response body.
-func (proxy *ProxyResponseWriter) Size() int {
-       return proxy.internal.Size()
-}
-
-// Before allows for a function to be called before the ResponseWriter has been written to. This is
-// useful for setting headers or any other operations that must happen before a response has been written.
-func (proxy *ProxyResponseWriter) Before(before macaron.BeforeFunc) {
-       proxy.internal.Before(before)
-}
-
-// Write appends data to the proxied gzip writer.
-func (proxy *ProxyResponseWriter) Write(b []byte) (int, error) {
-       // if writer is initialized, use the writer
-       if proxy.writer != nil {
-               return proxy.writer.Write(b)
-       }
-
-       proxy.buf = append(proxy.buf, b...)
-
-       var (
-               contentLength, _ = strconv.Atoi(proxy.Header().Get(contentLengthHeader))
-               contentType      = proxy.Header().Get(contentTypeHeader)
-               contentEncoding  = proxy.Header().Get(contentEncodingHeader)
-       )
-
-       // OK if an encoding hasn't been chosen, and content length > 1400
-       // and content type isn't a compressed type
-       if contentEncoding == "" &&
-               (contentLength == 0 || contentLength >= MinSize) &&
-               (contentType == "" || !compressedContentType(contentType)) {
-               // If current buffer is less than the min size and a Content-Length isn't set, then wait
-               if len(proxy.buf) < MinSize && contentLength == 0 {
-                       return len(b), nil
-               }
-
-               // If the Content-Length is larger than minSize or the current buffer is larger than minSize, then continue.
-               if contentLength >= MinSize || len(proxy.buf) >= MinSize {
-                       // if we don't know the content type, infer it
-                       if contentType == "" {
-                               contentType = http.DetectContentType(proxy.buf)
-                               proxy.Header().Set(contentTypeHeader, contentType)
-                       }
-                       // If the Content-Type is not compressed - Compress!
-                       if !compressedContentType(contentType) {
-                               if err := proxy.startGzip(); err != nil {
-                                       return 0, err
-                               }
-                               return len(b), nil
-                       }
-               }
-       }
-       // If we got here, we should not GZIP this response.
-       if err := proxy.startPlain(); err != nil {
-               return 0, err
-       }
-       return len(b), nil
-}
-
-func (proxy *ProxyResponseWriter) startGzip() error {
-       // Set the content-encoding and vary headers.
-       proxy.Header().Set(contentEncodingHeader, "gzip")
-       proxy.Header().Set(varyHeader, acceptEncodingHeader)
-
-       // if the Content-Length is already set, then calls to Write on gzip
-       // will fail to set the Content-Length header since its already set
-       // See: https://github.com/golang/go/issues/14975.
-       proxy.Header().Del(contentLengthHeader)
-
-       // Write the header to gzip response.
-       if proxy.code != 0 {
-               proxy.internal.WriteHeader(proxy.code)
-               // Ensure that no other WriteHeader's happen
-               proxy.code = 0
-       }
-
-       // Initialize and flush the buffer into the gzip response if there are any bytes.
-       // If there aren't any, we shouldn't initialize it yet because on Close it will
-       // write the gzip header even if nothing was ever written.
-       if len(proxy.buf) > 0 {
-               // Initialize the GZIP response.
-               proxy.writer = writerPool.Get(proxy.internal)
-
-               return proxy.writeBuf()
-       }
-       return nil
-}
-
-func (proxy *ProxyResponseWriter) startPlain() error {
-       if proxy.code != 0 {
-               proxy.internal.WriteHeader(proxy.code)
-               proxy.code = 0
-       }
-       proxy.stopped = true
-       proxy.writer = noopCloser{proxy.internal}
-       return proxy.writeBuf()
-}
-
-func (proxy *ProxyResponseWriter) writeBuf() error {
-       if proxy.buf == nil {
-               return nil
-       }
-
-       n, err := proxy.writer.Write(proxy.buf)
-
-       // This should never happen (per io.Writer docs), but if the write didn't
-       // accept the entire buffer but returned no specific error, we have no clue
-       // what's going on, so abort just to be safe.
-       if err == nil && n < len(proxy.buf) {
-               err = io.ErrShortWrite
-       }
-       proxy.buf = nil
-       return err
-}
-
-// WriteHeader will ensure that we have setup the writer before we write the header
-func (proxy *ProxyResponseWriter) WriteHeader(code int) {
-       if proxy.code == 0 {
-               proxy.code = code
-       }
-}
-
-// Close the writer
-func (proxy *ProxyResponseWriter) Close() error {
-       if proxy.stopped {
-               return nil
-       }
-
-       if proxy.writer == nil {
-               err := proxy.startPlain()
-               if err != nil {
-                       return fmt.Errorf("GzipMiddleware: write to regular responseWriter at close gets error: %q", err.Error())
-               }
-       }
-
-       err := proxy.writer.Close()
-
-       if poolWriter, ok := proxy.writer.(*gzip.Writer); ok {
-               writerPool.Put(poolWriter)
-       }
-
-       proxy.writer = nil
-       proxy.stopped = true
-       return err
-}
-
-// Flush the writer
-func (proxy *ProxyResponseWriter) Flush() {
-       if proxy.writer == nil {
-               return
-       }
-
-       if gw, ok := proxy.writer.(*gzip.Writer); ok {
-               gw.Flush()
-       }
-
-       proxy.internal.Flush()
-}
-
-// Hijack implements http.Hijacker. If the underlying ResponseWriter is a
-// Hijacker, its Hijack method is returned. Otherwise an error is returned.
-func (proxy *ProxyResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
-       hijacker, ok := proxy.internal.(http.Hijacker)
-       if !ok {
-               return nil, nil, fmt.Errorf("the ResponseWriter doesn't support the Hijacker interface")
-       }
-       return hijacker.Hijack()
-}
-
-// verify Hijacker interface implementation
-var _ http.Hijacker = &ProxyResponseWriter{}
-
-func compressedContentType(contentType string) bool {
-       switch contentType {
-       case "application/zip":
-               return true
-       case "application/x-gzip":
-               return true
-       case "application/gzip":
-               return true
-       default:
-               return false
-       }
-}
diff --git a/modules/gzip/gzip_test.go b/modules/gzip/gzip_test.go
deleted file mode 100644 (file)
index 5fc56cc..0000000
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright 2019 The Gitea Authors. All rights reserved.
-// Use of this source code is governed by a MIT-style
-// license that can be found in the LICENSE file.
-
-package gzip
-
-import (
-       "archive/zip"
-       "bytes"
-       "io/ioutil"
-       "net/http"
-       "net/http/httptest"
-       "testing"
-
-       "gitea.com/macaron/macaron"
-       gzipp "github.com/klauspost/compress/gzip"
-       "github.com/stretchr/testify/assert"
-)
-
-func setup(sampleResponse []byte) (*macaron.Macaron, *[]byte) {
-       m := macaron.New()
-       m.Use(Middleware())
-       m.Get("/", func() *[]byte { return &sampleResponse })
-       return m, &sampleResponse
-}
-
-func reqNoAcceptGzip(t *testing.T, m *macaron.Macaron, sampleResponse *[]byte) {
-       // Request without accept gzip: Should not gzip
-       resp := httptest.NewRecorder()
-       req, err := http.NewRequest("GET", "/", nil)
-       assert.NoError(t, err)
-       m.ServeHTTP(resp, req)
-
-       _, ok := resp.HeaderMap[contentEncodingHeader]
-       assert.False(t, ok)
-
-       contentEncoding := resp.Header().Get(contentEncodingHeader)
-       assert.NotContains(t, contentEncoding, "gzip")
-
-       result := resp.Body.Bytes()
-       assert.Equal(t, *sampleResponse, result)
-}
-
-func reqAcceptGzip(t *testing.T, m *macaron.Macaron, sampleResponse *[]byte, expectGzip bool) {
-       // Request without accept gzip: Should not gzip
-       resp := httptest.NewRecorder()
-       req, err := http.NewRequest("GET", "/", nil)
-       assert.NoError(t, err)
-       req.Header.Set(acceptEncodingHeader, "gzip")
-       m.ServeHTTP(resp, req)
-
-       _, ok := resp.HeaderMap[contentEncodingHeader]
-       assert.Equal(t, ok, expectGzip)
-
-       contentEncoding := resp.Header().Get(contentEncodingHeader)
-       if expectGzip {
-               assert.Contains(t, contentEncoding, "gzip")
-               gzippReader, err := gzipp.NewReader(resp.Body)
-               assert.NoError(t, err)
-               result, err := ioutil.ReadAll(gzippReader)
-               assert.NoError(t, err)
-               assert.Equal(t, *sampleResponse, result)
-       } else {
-               assert.NotContains(t, contentEncoding, "gzip")
-               result := resp.Body.Bytes()
-               assert.Equal(t, *sampleResponse, result)
-       }
-}
-
-func TestMiddlewareSmall(t *testing.T) {
-       m, sampleResponse := setup([]byte("Small response"))
-
-       reqNoAcceptGzip(t, m, sampleResponse)
-
-       reqAcceptGzip(t, m, sampleResponse, false)
-}
-
-func TestMiddlewareLarge(t *testing.T) {
-       b := make([]byte, MinSize+1)
-       for i := range b {
-               b[i] = byte(i % 256)
-       }
-       m, sampleResponse := setup(b)
-
-       reqNoAcceptGzip(t, m, sampleResponse)
-
-       // This should be gzipped as we accept gzip
-       reqAcceptGzip(t, m, sampleResponse, true)
-}
-
-func TestMiddlewareGzip(t *testing.T) {
-       b := make([]byte, MinSize*10)
-       for i := range b {
-               b[i] = byte(i % 256)
-       }
-       outputBuffer := bytes.NewBuffer([]byte{})
-       gzippWriter := gzipp.NewWriter(outputBuffer)
-       gzippWriter.Write(b)
-       gzippWriter.Flush()
-       gzippWriter.Close()
-       output := outputBuffer.Bytes()
-
-       m, sampleResponse := setup(output)
-
-       reqNoAcceptGzip(t, m, sampleResponse)
-
-       // This should not be gzipped even though we accept gzip
-       reqAcceptGzip(t, m, sampleResponse, false)
-}
-
-func TestMiddlewareZip(t *testing.T) {
-       b := make([]byte, MinSize*10)
-       for i := range b {
-               b[i] = byte(i % 256)
-       }
-       outputBuffer := bytes.NewBuffer([]byte{})
-       zipWriter := zip.NewWriter(outputBuffer)
-       fileWriter, err := zipWriter.Create("default")
-       assert.NoError(t, err)
-       fileWriter.Write(b)
-       //fileWriter.Close()
-       zipWriter.Close()
-       output := outputBuffer.Bytes()
-
-       m, sampleResponse := setup(output)
-
-       reqNoAcceptGzip(t, m, sampleResponse)
-
-       // This should not be gzipped even though we accept gzip
-       reqAcceptGzip(t, m, sampleResponse, false)
-}
diff --git a/routers/routes/routes.go b/routers/routes/routes.go
index 48bba16bf6f96be82cebff7da8bd166f0102dd76..6de293c5072fefadb6e68489e432ae7821ee9bcc 100644 (file)
@@ -16,7 +16,6 @@ import (
        "code.gitea.io/gitea/models"
        "code.gitea.io/gitea/modules/auth"
        "code.gitea.io/gitea/modules/context"
-       "code.gitea.io/gitea/modules/gzip"
        "code.gitea.io/gitea/modules/lfs"
        "code.gitea.io/gitea/modules/log"
        "code.gitea.io/gitea/modules/metrics"
@@ -44,6 +43,7 @@ import (
        "gitea.com/macaron/captcha"
        "gitea.com/macaron/cors"
        "gitea.com/macaron/csrf"
+       "gitea.com/macaron/gzip"
        "gitea.com/macaron/i18n"
        "gitea.com/macaron/macaron"
        "gitea.com/macaron/session"
diff --git a/vendor/gitea.com/macaron/gzip/go.mod b/vendor/gitea.com/macaron/gzip/go.mod
new file mode 100644 (file)
index 0000000..e66caca
--- /dev/null
@@ -0,0 +1,9 @@
+module gitea.com/macaron/gzip
+
+go 1.12
+
+require (
+       gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb
+       github.com/klauspost/compress v1.9.2
+       github.com/stretchr/testify v1.4.0
+)
diff --git a/vendor/gitea.com/macaron/gzip/go.sum b/vendor/gitea.com/macaron/gzip/go.sum
new file mode 100644 (file)
index 0000000..292be5f
--- /dev/null
@@ -0,0 +1,42 @@
+gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591 h1:UbCTjPcLrNxR9LzKDjQBMT2zoxZuEnca1pZCpgeMuhQ=
+gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591/go.mod h1:h6E4kLao1Yko6DOU6QDnQPcuoNzvbZqzj2mtPcEn1aM=
+gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb h1:amL0md6orTj1tXY16ANzVU9FmzQB+W7aJwp8pVDbrmA=
+gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb/go.mod h1:0coI+mSPSwbsyAbOuFllVS38awuk9mevhLD52l50Gjs=
+github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
+github.com/gopherjs/gopherjs v0.0.0-20181103185306-d547d1d9531e h1:JKmoR8x90Iww1ks85zJ1lfDGgIiMDuIptTOhJq+zKyg=
+github.com/gopherjs/gopherjs v0.0.0-20181103185306-d547d1d9531e/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
+github.com/jtolds/gls v4.2.1+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
+github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
+github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
+github.com/klauspost/compress v1.9.2 h1:LfVyl+ZlLlLDeQ/d2AqfGIIH4qEDu0Ed2S5GyhCWIWY=
+github.com/klauspost/compress v1.9.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
+github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304 h1:Jpy1PXuP99tXNrhbq2BaPz9B+jNAvH1JPQQpG/9GCXY=
+github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
+github.com/smartystreets/goconvey v0.0.0-20181108003508-044398e4856c/go.mod h1:XDJAKZRPZ1CvBcN2aX5YOUTYGHki24fSF0Iv48Ibg0s=
+github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337 h1:WN9BUFbdyOsSH/XohnWpXOlq9NBD5sGAB2FciQMUEe8=
+github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
+github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
+github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e h1:GSGeB9EAKY2spCABz6xOX5DbxZEXolK+nBSvmsQwRjM=
+github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 h1:HuIa8hRrWRSrqYzx1qI49NNxhdi2PrY7gxVSq1JjLDc=
+golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/ini.v1 v1.44.0 h1:YRJzTUp0kSYWUVFF5XAbDFfyiqwsl0Vb9R8TVP5eRi0=
+gopkg.in/ini.v1 v1.44.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
+gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
+gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/vendor/gitea.com/macaron/gzip/gzip.go b/vendor/gitea.com/macaron/gzip/gzip.go
new file mode 100644 (file)
index 0000000..9573d16
--- /dev/null
@@ -0,0 +1,358 @@
+// Copyright 2019 The Gitea Authors. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package gzip
+
+import (
+       "bufio"
+       "fmt"
+       "io"
+       "net"
+       "net/http"
+       "regexp"
+       "strconv"
+       "strings"
+       "sync"
+
+       "gitea.com/macaron/macaron"
+       "github.com/klauspost/compress/gzip"
+)
+
+const (
+       acceptEncodingHeader  = "Accept-Encoding"
+       contentEncodingHeader = "Content-Encoding"
+       contentLengthHeader   = "Content-Length"
+       contentTypeHeader     = "Content-Type"
+       rangeHeader           = "Range"
+       varyHeader            = "Vary"
+)
+
+const (
+       // MinSize is the minimum size of content we will compress
+       MinSize = 1400
+)
+
+// noopClosers are io.Writers with a shim to prevent early closure
+type noopCloser struct {
+       io.Writer
+}
+
+func (noopCloser) Close() error { return nil }
+
+// WriterPool is a gzip writer pool to reduce workload on creation of
+// gzip writers
+type WriterPool struct {
+       pool             sync.Pool
+       compressionLevel int
+}
+
+// NewWriterPool creates a new pool
+func NewWriterPool(compressionLevel int) *WriterPool {
+       return &WriterPool{pool: sync.Pool{
+               // New will return nil, we'll manage the creation of new
+               // writers in the middleware
+               New: func() interface{} { return nil },
+       },
+               compressionLevel: compressionLevel}
+}
+
+// Get a writer from the pool - or create one if not available
+func (wp *WriterPool) Get(rw macaron.ResponseWriter) *gzip.Writer {
+       ret := wp.pool.Get()
+       if ret == nil {
+               ret, _ = gzip.NewWriterLevel(rw, wp.compressionLevel)
+       } else {
+               ret.(*gzip.Writer).Reset(rw)
+       }
+       return ret.(*gzip.Writer)
+}
+
+// Put returns a writer to the pool
+func (wp *WriterPool) Put(w *gzip.Writer) {
+       wp.pool.Put(w)
+}
+
+var writerPool WriterPool
+
+// Options represents the configuration for the gzip middleware
+type Options struct {
+       CompressionLevel int
+}
+
+func validateCompressionLevel(level int) bool {
+       return level == gzip.DefaultCompression ||
+               level == gzip.ConstantCompression ||
+               (level >= gzip.BestSpeed && level <= gzip.BestCompression)
+}
+
+func validate(options []Options) Options {
+       // Default to level 4 compression (Best results seem to be between 4 and 6)
+       opt := Options{CompressionLevel: 4}
+       if len(options) > 0 {
+               opt = options[0]
+       }
+       if !validateCompressionLevel(opt.CompressionLevel) {
+               opt.CompressionLevel = 4
+       }
+       return opt
+}
+
+// Middleware creates a macaron.Handler to proxy the response
+func Middleware(options ...Options) macaron.Handler {
+       opt := validate(options)
+       writerPool = *NewWriterPool(opt.CompressionLevel)
+       regex := regexp.MustCompile(`bytes=(\d+)\-.*`)
+
+       return func(ctx *macaron.Context) {
+               // If the client won't accept gzip or x-gzip don't compress
+               if !strings.Contains(ctx.Req.Header.Get(acceptEncodingHeader), "gzip") &&
+                       !strings.Contains(ctx.Req.Header.Get(acceptEncodingHeader), "x-gzip") {
+                       return
+               }
+
+               // If the client is asking for a specific range of bytes - don't compress
+               if rangeHdr := ctx.Req.Header.Get(rangeHeader); rangeHdr != "" {
+
+                       match := regex.FindStringSubmatch(rangeHdr)
+                       if len(match) > 1 {
+                               return
+                       }
+               }
+
+               // OK we should proxy the response writer
+               // We are still not necessarily going to compress...
+               proxyWriter := &ProxyResponseWriter{
+                       internal: ctx.Resp,
+               }
+               defer proxyWriter.Close()
+
+               ctx.Resp = proxyWriter
+               ctx.MapTo(proxyWriter, (*http.ResponseWriter)(nil))
+
+               // Check if render middleware has been registered,
+               // if yes, we need to modify ResponseWriter for it as well.
+               if _, ok := ctx.Render.(*macaron.DummyRender); !ok {
+                       ctx.Render.SetResponseWriter(proxyWriter)
+               }
+
+               ctx.Next()
+               ctx.Resp = proxyWriter.internal
+       }
+}
+
+// ProxyResponseWriter is a wrapped macaron ResponseWriter that may compress its contents
+type ProxyResponseWriter struct {
+       writer   io.WriteCloser
+       internal macaron.ResponseWriter
+       stopped  bool
+
+       code int
+       buf  []byte
+}
+
+// Header returns the header map
+func (proxy *ProxyResponseWriter) Header() http.Header {
+       return proxy.internal.Header()
+}
+
+// Status returns the status code of the response or 0 if the response has not been written.
+func (proxy *ProxyResponseWriter) Status() int {
+       if proxy.code != 0 {
+               return proxy.code
+       }
+       return proxy.internal.Status()
+}
+
+// Written returns whether or not the ResponseWriter has been written.
+func (proxy *ProxyResponseWriter) Written() bool {
+       if proxy.code != 0 {
+               return true
+       }
+       return proxy.internal.Written()
+}
+
+// Size returns the size of the response body.
+func (proxy *ProxyResponseWriter) Size() int {
+       return proxy.internal.Size()
+}
+
+// Before allows for a function to be called before the ResponseWriter has been written to. This is
+// useful for setting headers or any other operations that must happen before a response has been written.
+func (proxy *ProxyResponseWriter) Before(before macaron.BeforeFunc) {
+       proxy.internal.Before(before)
+}
+
+// Write appends data to the proxied gzip writer.
+func (proxy *ProxyResponseWriter) Write(b []byte) (int, error) {
+       // if writer is initialized, use the writer
+       if proxy.writer != nil {
+               return proxy.writer.Write(b)
+       }
+
+       proxy.buf = append(proxy.buf, b...)
+
+       var (
+               contentLength, _ = strconv.Atoi(proxy.Header().Get(contentLengthHeader))
+               contentType      = proxy.Header().Get(contentTypeHeader)
+               contentEncoding  = proxy.Header().Get(contentEncodingHeader)
+       )
+
+       // OK if an encoding hasn't been chosen, and content length > 1400
+       // and content type isn't a compressed type
+       if contentEncoding == "" &&
+               (contentLength == 0 || contentLength >= MinSize) &&
+               (contentType == "" || !compressedContentType(contentType)) {
+               // If current buffer is less than the min size and a Content-Length isn't set, then wait
+               if len(proxy.buf) < MinSize && contentLength == 0 {
+                       return len(b), nil
+               }
+
+               // If the Content-Length is larger than minSize or the current buffer is larger than minSize, then continue.
+               if contentLength >= MinSize || len(proxy.buf) >= MinSize {
+                       // if we don't know the content type, infer it
+                       if contentType == "" {
+                               contentType = http.DetectContentType(proxy.buf)
+                               proxy.Header().Set(contentTypeHeader, contentType)
+                       }
+                       // If the Content-Type is not compressed - Compress!
+                       if !compressedContentType(contentType) {
+                               if err := proxy.startGzip(); err != nil {
+                                       return 0, err
+                               }
+                               return len(b), nil
+                       }
+               }
+       }
+       // If we got here, we should not GZIP this response.
+       if err := proxy.startPlain(); err != nil {
+               return 0, err
+       }
+       return len(b), nil
+}
+
+func (proxy *ProxyResponseWriter) startGzip() error {
+       // Set the content-encoding and vary headers.
+       proxy.Header().Set(contentEncodingHeader, "gzip")
+       proxy.Header().Set(varyHeader, acceptEncodingHeader)
+
+       // if the Content-Length is already set, then calls to Write on gzip
+       // will fail to set the Content-Length header since its already set
+       // See: https://github.com/golang/go/issues/14975.
+       proxy.Header().Del(contentLengthHeader)
+
+       // Write the header to gzip response.
+       if proxy.code != 0 {
+               proxy.internal.WriteHeader(proxy.code)
+               // Ensure that no other WriteHeader's happen
+               proxy.code = 0
+       }
+
+       // Initialize and flush the buffer into the gzip response if there are any bytes.
+       // If there aren't any, we shouldn't initialize it yet because on Close it will
+       // write the gzip header even if nothing was ever written.
+       if len(proxy.buf) > 0 {
+               // Initialize the GZIP response.
+               proxy.writer = writerPool.Get(proxy.internal)
+
+               return proxy.writeBuf()
+       }
+       return nil
+}
+
+func (proxy *ProxyResponseWriter) startPlain() error {
+       if proxy.code != 0 {
+               proxy.internal.WriteHeader(proxy.code)
+               proxy.code = 0
+       }
+       proxy.stopped = true
+       proxy.writer = noopCloser{proxy.internal}
+       return proxy.writeBuf()
+}
+
+func (proxy *ProxyResponseWriter) writeBuf() error {
+       if proxy.buf == nil {
+               return nil
+       }
+
+       n, err := proxy.writer.Write(proxy.buf)
+
+       // This should never happen (per io.Writer docs), but if the write didn't
+       // accept the entire buffer but returned no specific error, we have no clue
+       // what's going on, so abort just to be safe.
+       if err == nil && n < len(proxy.buf) {
+               err = io.ErrShortWrite
+       }
+       proxy.buf = nil
+       return err
+}
+
+// WriteHeader will ensure that we have setup the writer before we write the header
+func (proxy *ProxyResponseWriter) WriteHeader(code int) {
+       if proxy.code == 0 {
+               proxy.code = code
+       }
+}
+
+// Close the writer
+func (proxy *ProxyResponseWriter) Close() error {
+       if proxy.stopped {
+               return nil
+       }
+
+       if proxy.writer == nil {
+               err := proxy.startPlain()
+               if err != nil {
+                       return fmt.Errorf("GzipMiddleware: write to regular responseWriter at close gets error: %q", err.Error())
+               }
+       }
+
+       err := proxy.writer.Close()
+
+       if poolWriter, ok := proxy.writer.(*gzip.Writer); ok {
+               writerPool.Put(poolWriter)
+       }
+
+       proxy.writer = nil
+       proxy.stopped = true
+       return err
+}
+
+// Flush the writer
+func (proxy *ProxyResponseWriter) Flush() {
+       if proxy.writer == nil {
+               return
+       }
+
+       if gw, ok := proxy.writer.(*gzip.Writer); ok {
+               gw.Flush()
+       }
+
+       proxy.internal.Flush()
+}
+
+// Hijack implements http.Hijacker. If the underlying ResponseWriter is a
+// Hijacker, its Hijack method is returned. Otherwise an error is returned.
+func (proxy *ProxyResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
+       hijacker, ok := proxy.internal.(http.Hijacker)
+       if !ok {
+               return nil, nil, fmt.Errorf("the ResponseWriter doesn't support the Hijacker interface")
+       }
+       return hijacker.Hijack()
+}
+
+// verify Hijacker interface implementation
+var _ http.Hijacker = &ProxyResponseWriter{}
+
+func compressedContentType(contentType string) bool {
+       switch contentType {
+       case "application/zip":
+               return true
+       case "application/x-gzip":
+               return true
+       case "application/gzip":
+               return true
+       default:
+               return false
+       }
+}
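
The vendored middleware above amortizes allocation by keeping gzip writers in a sync.Pool whose New returns nil, so writers are created lazily at the configured level and Reset on reuse. A standalone sketch of the same pooling pattern using the standard library compress/gzip (the level and helper name are illustrative, not part of this commit):

package main

import (
	"bytes"
	"compress/gzip"
	"fmt"
	"sync"
)

var pool = sync.Pool{
	// New returns nil; writers are created lazily so the compression
	// level can be chosen at the call site, mirroring WriterPool above.
	New: func() interface{} { return nil },
}

func compress(data []byte) []byte {
	var buf bytes.Buffer
	var w *gzip.Writer
	if v := pool.Get(); v != nil {
		w = v.(*gzip.Writer)
		w.Reset(&buf)
	} else {
		w, _ = gzip.NewWriterLevel(&buf, 4)
	}
	w.Write(data)
	w.Close()
	pool.Put(w)
	return buf.Bytes()
}

func main() {
	fmt.Println(len(compress(bytes.Repeat([]byte("gitea "), 300))))
}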
diff --git a/vendor/github.com/klauspost/compress/LICENSE b/vendor/github.com/klauspost/compress/LICENSE
index 74487567632c8f137ef3971b0f5912ca50bebcda..1eb75ef68e448f6a726e601ceef9772aef33cd40 100644 (file)
@@ -1,4 +1,5 @@
 Copyright (c) 2012 The Go Authors. All rights reserved.
+Copyright (c) 2019 Klaus Post. All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
diff --git a/vendor/github.com/klauspost/compress/flate/copy.go b/vendor/github.com/klauspost/compress/flate/copy.go
deleted file mode 100644 (file)
index a3200a8..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package flate
-
-// forwardCopy is like the built-in copy function except that it always goes
-// forward from the start, even if the dst and src overlap.
-// It is equivalent to:
-//   for i := 0; i < n; i++ {
-//     mem[dst+i] = mem[src+i]
-//   }
-func forwardCopy(mem []byte, dst, src, n int) {
-       if dst <= src {
-               copy(mem[dst:dst+n], mem[src:src+n])
-               return
-       }
-       for {
-               if dst >= src+n {
-                       copy(mem[dst:dst+n], mem[src:src+n])
-                       return
-               }
-               // There is some forward overlap.  The destination
-               // will be filled with a repeated pattern of mem[src:src+k].
-               // We copy one instance of the pattern here, then repeat.
-               // Each time around this loop k will double.
-               k := dst - src
-               copy(mem[dst:dst+k], mem[src:src+k])
-               n -= k
-               dst += k
-       }
-}
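
forwardCopy, removed here along with the rest of the pre-rewrite flate code, performs the LZ77-style overlapping copy: when dst overlaps src, the copied region is filled with a repeated pattern of the source. A small self-contained sketch of that behavior (the sample data is illustrative; the function body is taken from the deleted file above):

package main

import "fmt"

// forwardCopy always copies forward from the start, so an overlapping
// destination is filled with a repeated pattern of mem[src:src+k].
func forwardCopy(mem []byte, dst, src, n int) {
	if dst <= src {
		copy(mem[dst:dst+n], mem[src:src+n])
		return
	}
	for {
		if dst >= src+n {
			copy(mem[dst:dst+n], mem[src:src+n])
			return
		}
		k := dst - src
		copy(mem[dst:dst+k], mem[src:src+k])
		n -= k
		dst += k
	}
}

func main() {
	mem := []byte("ab------")
	forwardCopy(mem, 2, 0, 6)
	fmt.Println(string(mem)) // abababab
}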
diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.go b/vendor/github.com/klauspost/compress/flate/crc32_amd64.go
deleted file mode 100644 (file)
index 70a6095..0000000
+++ /dev/null
@@ -1,41 +0,0 @@
-//+build !noasm
-//+build !appengine
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-
-package flate
-
-import (
-       "github.com/klauspost/cpuid"
-)
-
-// crc32sse returns a hash for the first 4 bytes of the slice
-// len(a) must be >= 4.
-//go:noescape
-func crc32sse(a []byte) uint32
-
-// crc32sseAll calculates hashes for each 4-byte set in a.
-// dst must be east len(a) - 4 in size.
-// The size is not checked by the assembly.
-//go:noescape
-func crc32sseAll(a []byte, dst []uint32)
-
-// matchLenSSE4 returns the number of matching bytes in a and b
-// up to length 'max'. Both slices must be at least 'max'
-// bytes in size.
-//
-// TODO: drop the "SSE4" name, since it doesn't use any SSE instructions.
-//
-//go:noescape
-func matchLenSSE4(a, b []byte, max int) int
-
-// histogram accumulates a histogram of b in h.
-// h must be at least 256 entries in length,
-// and must be cleared before calling this function.
-//go:noescape
-func histogram(b []byte, h []int32)
-
-// Detect SSE 4.2 feature.
-func init() {
-       useSSE42 = cpuid.CPU.SSE42()
-}
diff --git a/vendor/github.com/klauspost/compress/flate/crc32_amd64.s b/vendor/github.com/klauspost/compress/flate/crc32_amd64.s
deleted file mode 100644 (file)
index 2fb2079..0000000
+++ /dev/null
@@ -1,213 +0,0 @@
-//+build !noasm
-//+build !appengine
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-
-// func crc32sse(a []byte) uint32
-TEXT ·crc32sse(SB), 4, $0
-       MOVQ a+0(FP), R10
-       XORQ BX, BX
-
-       // CRC32   dword (R10), EBX
-       BYTE $0xF2; BYTE $0x41; BYTE $0x0f
-       BYTE $0x38; BYTE $0xf1; BYTE $0x1a
-
-       MOVL BX, ret+24(FP)
-       RET
-
-// func crc32sseAll(a []byte, dst []uint32)
-TEXT ·crc32sseAll(SB), 4, $0
-       MOVQ  a+0(FP), R8      // R8: src
-       MOVQ  a_len+8(FP), R10 // input length
-       MOVQ  dst+24(FP), R9   // R9: dst
-       SUBQ  $4, R10
-       JS    end
-       JZ    one_crc
-       MOVQ  R10, R13
-       SHRQ  $2, R10          // len/4
-       ANDQ  $3, R13          // len&3
-       XORQ  BX, BX
-       ADDQ  $1, R13
-       TESTQ R10, R10
-       JZ    rem_loop
-
-crc_loop:
-       MOVQ (R8), R11
-       XORQ BX, BX
-       XORQ DX, DX
-       XORQ DI, DI
-       MOVQ R11, R12
-       SHRQ $8, R11
-       MOVQ R12, AX
-       MOVQ R11, CX
-       SHRQ $16, R12
-       SHRQ $16, R11
-       MOVQ R12, SI
-
-       // CRC32   EAX, EBX
-       BYTE $0xF2; BYTE $0x0f
-       BYTE $0x38; BYTE $0xf1; BYTE $0xd8
-
-       // CRC32   ECX, EDX
-       BYTE $0xF2; BYTE $0x0f
-       BYTE $0x38; BYTE $0xf1; BYTE $0xd1
-
-       // CRC32   ESI, EDI
-       BYTE $0xF2; BYTE $0x0f
-       BYTE $0x38; BYTE $0xf1; BYTE $0xfe
-       MOVL BX, (R9)
-       MOVL DX, 4(R9)
-       MOVL DI, 8(R9)
-
-       XORQ BX, BX
-       MOVL R11, AX
-
-       // CRC32   EAX, EBX
-       BYTE $0xF2; BYTE $0x0f
-       BYTE $0x38; BYTE $0xf1; BYTE $0xd8
-       MOVL BX, 12(R9)
-
-       ADDQ $16, R9
-       ADDQ $4, R8
-       XORQ BX, BX
-       SUBQ $1, R10
-       JNZ  crc_loop
-
-rem_loop:
-       MOVL (R8), AX
-
-       // CRC32   EAX, EBX
-       BYTE $0xF2; BYTE $0x0f
-       BYTE $0x38; BYTE $0xf1; BYTE $0xd8
-
-       MOVL BX, (R9)
-       ADDQ $4, R9
-       ADDQ $1, R8
-       XORQ BX, BX
-       SUBQ $1, R13
-       JNZ  rem_loop
-
-end:
-       RET
-
-one_crc:
-       MOVQ $1, R13
-       XORQ BX, BX
-       JMP  rem_loop
-
-// func matchLenSSE4(a, b []byte, max int) int
-TEXT ·matchLenSSE4(SB), 4, $0
-       MOVQ a_base+0(FP), SI
-       MOVQ b_base+24(FP), DI
-       MOVQ DI, DX
-       MOVQ max+48(FP), CX
-
-cmp8:
-       // As long as we are 8 or more bytes before the end of max, we can load and
-       // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
-       CMPQ CX, $8
-       JLT  cmp1
-       MOVQ (SI), AX
-       MOVQ (DI), BX
-       CMPQ AX, BX
-       JNE  bsf
-       ADDQ $8, SI
-       ADDQ $8, DI
-       SUBQ $8, CX
-       JMP  cmp8
-
-bsf:
-       // If those 8 bytes were not equal, XOR the two 8 byte values, and return
-       // the index of the first byte that differs. The BSF instruction finds the
-       // least significant 1 bit, the amd64 architecture is little-endian, and
-       // the shift by 3 converts a bit index to a byte index.
-       XORQ AX, BX
-       BSFQ BX, BX
-       SHRQ $3, BX
-       ADDQ BX, DI
-
-       // Subtract off &b[0] to convert from &b[ret] to ret, and return.
-       SUBQ DX, DI
-       MOVQ DI, ret+56(FP)
-       RET
-
-cmp1:
-       // In the slices' tail, compare 1 byte at a time.
-       CMPQ CX, $0
-       JEQ  matchLenEnd
-       MOVB (SI), AX
-       MOVB (DI), BX
-       CMPB AX, BX
-       JNE  matchLenEnd
-       ADDQ $1, SI
-       ADDQ $1, DI
-       SUBQ $1, CX
-       JMP  cmp1
-
-matchLenEnd:
-       // Subtract off &b[0] to convert from &b[ret] to ret, and return.
-       SUBQ DX, DI
-       MOVQ DI, ret+56(FP)
-       RET
-
-// func histogram(b []byte, h []int32)
-TEXT ·histogram(SB), 4, $0
-       MOVQ b+0(FP), SI     // SI: &b
-       MOVQ b_len+8(FP), R9 // R9: len(b)
-       MOVQ h+24(FP), DI    // DI: Histogram
-       MOVQ R9, R8
-       SHRQ $3, R8
-       JZ   hist1
-       XORQ R11, R11
-
-loop_hist8:
-       MOVQ (SI), R10
-
-       MOVB R10, R11
-       INCL (DI)(R11*4)
-       SHRQ $8, R10
-
-       MOVB R10, R11
-       INCL (DI)(R11*4)
-       SHRQ $8, R10
-
-       MOVB R10, R11
-       INCL (DI)(R11*4)
-       SHRQ $8, R10
-
-       MOVB R10, R11
-       INCL (DI)(R11*4)
-       SHRQ $8, R10
-
-       MOVB R10, R11
-       INCL (DI)(R11*4)
-       SHRQ $8, R10
-
-       MOVB R10, R11
-       INCL (DI)(R11*4)
-       SHRQ $8, R10
-
-       MOVB R10, R11
-       INCL (DI)(R11*4)
-       SHRQ $8, R10
-
-       INCL (DI)(R10*4)
-
-       ADDQ $8, SI
-       DECQ R8
-       JNZ  loop_hist8
-
-hist1:
-       ANDQ $7, R9
-       JZ   end_hist
-       XORQ R10, R10
-
-loop_hist1:
-       MOVB (SI), R10
-       INCL (DI)(R10*4)
-       INCQ SI
-       DECQ R9
-       JNZ  loop_hist1
-
-end_hist:
-       RET
diff --git a/vendor/github.com/klauspost/compress/flate/crc32_noasm.go b/vendor/github.com/klauspost/compress/flate/crc32_noasm.go
deleted file mode 100644 (file)
index bd98bd5..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-//+build !amd64 noasm appengine
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-
-package flate
-
-func init() {
-       useSSE42 = false
-}
-
-// crc32sse should never be called.
-func crc32sse(a []byte) uint32 {
-       panic("no assembler")
-}
-
-// crc32sseAll should never be called.
-func crc32sseAll(a []byte, dst []uint32) {
-       panic("no assembler")
-}
-
-// matchLenSSE4 should never be called.
-func matchLenSSE4(a, b []byte, max int) int {
-       panic("no assembler")
-       return 0
-}
-
-// histogram accumulates a histogram of b in h.
-//
-// len(h) must be >= 256, and h's elements must be all zeroes.
-func histogram(b []byte, h []int32) {
-       h = h[:256]
-       for _, t := range b {
-               h[t]++
-       }
-}
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go
index 76e9682f7e96a864542ac228f4ca1d7615aa0055..20c94f596843901f085246cbb8c30bc7607b9dbb 100644 (file)
@@ -50,8 +50,6 @@ const (
        skipNever = math.MaxInt32
 )
 
-var useSSE42 bool
-
 type compressionLevel struct {
        good, lazy, nice, chain, fastSkipHashing, level int
 }
@@ -77,16 +75,14 @@ var levels = []compressionLevel{
        {32, 258, 258, 4096, skipNever, 9},
 }
 
-type compressor struct {
-       compressionLevel
-
-       w          *huffmanBitWriter
-       bulkHasher func([]byte, []uint32)
-
-       // compression algorithm
-       fill func(*compressor, []byte) int // copy data to window
-       step func(*compressor)             // process window
-       sync bool                          // requesting flush
+// advancedState contains state for the advanced levels, with bigger hash tables, etc.
+type advancedState struct {
+       // deflate state
+       length         int
+       offset         int
+       hash           uint32
+       maxInsertIndex int
+       ii             uint16 // position of last match, intended to overflow to reset.
 
        // Input hash chains
        // hashHead[hashValue] contains the largest inputIndex with the specified hash value
@@ -99,55 +95,63 @@ type compressor struct {
        hashOffset int
 
        // input window: unprocessed data is window[index:windowEnd]
-       index         int
+       index     int
+       hashMatch [maxMatchLength + minMatchLength]uint32
+}
+
+type compressor struct {
+       compressionLevel
+
+       w *huffmanBitWriter
+
+       // compression algorithm
+       fill func(*compressor, []byte) int // copy data to window
+       step func(*compressor)             // process window
+       sync bool                          // requesting flush
+
        window        []byte
        windowEnd     int
        blockStart    int  // window index where current tokens start
        byteAvailable bool // if true, still need to process window[index-1].
+       err           error
 
        // queued output tokens
        tokens tokens
-
-       // deflate state
-       length         int
-       offset         int
-       hash           uint32
-       maxInsertIndex int
-       err            error
-       ii             uint16 // position of last match, intended to overflow to reset.
-
-       snap      snappyEnc
-       hashMatch [maxMatchLength + minMatchLength]uint32
+       fast   fastEnc
+       state  *advancedState
 }
 
 func (d *compressor) fillDeflate(b []byte) int {
-       if d.index >= 2*windowSize-(minMatchLength+maxMatchLength) {
+       s := d.state
+       if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) {
                // shift the window by windowSize
                copy(d.window[:], d.window[windowSize:2*windowSize])
-               d.index -= windowSize
+               s.index -= windowSize
                d.windowEnd -= windowSize
                if d.blockStart >= windowSize {
                        d.blockStart -= windowSize
                } else {
                        d.blockStart = math.MaxInt32
                }
-               d.hashOffset += windowSize
-               if d.hashOffset > maxHashOffset {
-                       delta := d.hashOffset - 1
-                       d.hashOffset -= delta
-                       d.chainHead -= delta
-                       for i, v := range d.hashPrev {
+               s.hashOffset += windowSize
+               if s.hashOffset > maxHashOffset {
+                       delta := s.hashOffset - 1
+                       s.hashOffset -= delta
+                       s.chainHead -= delta
+                       // Iterate over slices instead of arrays to avoid copying
+                       // the entire table onto the stack (Issue #18625).
+                       for i, v := range s.hashPrev[:] {
                                if int(v) > delta {
-                                       d.hashPrev[i] = uint32(int(v) - delta)
+                                       s.hashPrev[i] = uint32(int(v) - delta)
                                } else {
-                                       d.hashPrev[i] = 0
+                                       s.hashPrev[i] = 0
                                }
                        }
-                       for i, v := range d.hashHead {
+                       for i, v := range s.hashHead[:] {
                                if int(v) > delta {
-                                       d.hashHead[i] = uint32(int(v) - delta)
+                                       s.hashHead[i] = uint32(int(v) - delta)
                                } else {
-                                       d.hashHead[i] = 0
+                                       s.hashHead[i] = 0
                                }
                        }
                }
@@ -157,14 +161,14 @@ func (d *compressor) fillDeflate(b []byte) int {
        return n
 }
 
-func (d *compressor) writeBlock(tok tokens, index int, eof bool) error {
+func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error {
        if index > 0 || eof {
                var window []byte
                if d.blockStart <= index {
                        window = d.window[d.blockStart:index]
                }
                d.blockStart = index
-               d.w.writeBlock(tok.tokens[:tok.n], eof, window)
+               d.w.writeBlock(tok, eof, window)
                return d.w.err
        }
        return nil
@@ -173,20 +177,20 @@ func (d *compressor) writeBlock(tok tokens, index int, eof bool) error {
 // writeBlockSkip writes the current block and uses the number of tokens
 // to determine whether the block should be stored when there are no matches, or
 // only huffman encoded.
-func (d *compressor) writeBlockSkip(tok tokens, index int, eof bool) error {
+func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error {
        if index > 0 || eof {
                if d.blockStart <= index {
                        window := d.window[d.blockStart:index]
                        // If we removed less than a 64th of all literals
                        // we huffman compress the block.
                        if int(tok.n) > len(window)-int(tok.n>>6) {
-                               d.w.writeBlockHuff(eof, window)
+                               d.w.writeBlockHuff(eof, window, d.sync)
                        } else {
                                // Write a dynamic huffman block.
-                               d.w.writeBlockDynamic(tok.tokens[:tok.n], eof, window)
+                               d.w.writeBlockDynamic(tok, eof, window, d.sync)
                        }
                } else {
-                       d.w.writeBlock(tok.tokens[:tok.n], eof, nil)
+                       d.w.writeBlock(tok, eof, nil)
                }
                d.blockStart = index
                return d.w.err
@@ -201,10 +205,19 @@ func (d *compressor) writeBlockSkip(tok tokens, index int, eof bool) error {
 func (d *compressor) fillWindow(b []byte) {
        // Do not fill window if we are in store-only mode,
        // use constant or Snappy compression.
-       switch d.compressionLevel.level {
-       case 0, 1, 2:
+       if d.level == 0 {
+               return
+       }
+       if d.fast != nil {
+               // encode the last data, but discard the result
+               if len(b) > maxMatchOffset {
+                       b = b[len(b)-maxMatchOffset:]
+               }
+               d.fast.Encode(&d.tokens, b)
+               d.tokens.Reset()
                return
        }
+       s := d.state
        // If we are given too much, cut it.
        if len(b) > windowSize {
                b = b[len(b)-windowSize:]
@@ -227,28 +240,28 @@ func (d *compressor) fillWindow(b []byte) {
                        continue
                }
 
-               dst := d.hashMatch[:dstSize]
-               d.bulkHasher(tocheck, dst)
+               dst := s.hashMatch[:dstSize]
+               bulkHash4(tocheck, dst)
                var newH uint32
                for i, val := range dst {
                        di := i + startindex
                        newH = val & hashMask
                        // Get previous value with the same hash.
                        // Our chain should point to the previous value.
-                       d.hashPrev[di&windowMask] = d.hashHead[newH]
+                       s.hashPrev[di&windowMask] = s.hashHead[newH]
                        // Set the head of the hash chain to us.
-                       d.hashHead[newH] = uint32(di + d.hashOffset)
+                       s.hashHead[newH] = uint32(di + s.hashOffset)
                }
-               d.hash = newH
+               s.hash = newH
        }
        // Update window information.
        d.windowEnd += n
-       d.index = n
+       s.index = n
 }
 
 // Try to find a match starting at index whose length is greater than prevSize.
 // We only look at chainCount possibilities before giving up.
-// pos = d.index, prevHead = d.chainHead-d.hashOffset, prevLength=minMatchLength-1, lookahead
+// pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead
 func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
        minMatchLook := maxMatchLength
        if lookahead < minMatchLook {
@@ -276,7 +289,7 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
 
        for i := prevHead; tries > 0; tries-- {
                if wEnd == win[i+length] {
-                       n := matchLen(win[i:], wPos, minMatchLook)
+                       n := matchLen(win[i:i+minMatchLook], wPos)
 
                        if n > length && (n > minMatchLength || pos-i <= 4096) {
                                length = n
@@ -293,62 +306,7 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
                        // hashPrev[i & windowMask] has already been overwritten, so stop now.
                        break
                }
-               i = int(d.hashPrev[i&windowMask]) - d.hashOffset
-               if i < minIndex || i < 0 {
-                       break
-               }
-       }
-       return
-}
-
-// Try to find a match starting at index whose length is greater than prevSize.
-// We only look at chainCount possibilities before giving up.
-// pos = d.index, prevHead = d.chainHead-d.hashOffset, prevLength=minMatchLength-1, lookahead
-func (d *compressor) findMatchSSE(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
-       minMatchLook := maxMatchLength
-       if lookahead < minMatchLook {
-               minMatchLook = lookahead
-       }
-
-       win := d.window[0 : pos+minMatchLook]
-
-       // We quit when we get a match that's at least nice long
-       nice := len(win) - pos
-       if d.nice < nice {
-               nice = d.nice
-       }
-
-       // If we've got a match that's good enough, only look in 1/4 the chain.
-       tries := d.chain
-       length = prevLength
-       if length >= d.good {
-               tries >>= 2
-       }
-
-       wEnd := win[pos+length]
-       wPos := win[pos:]
-       minIndex := pos - windowSize
-
-       for i := prevHead; tries > 0; tries-- {
-               if wEnd == win[i+length] {
-                       n := matchLenSSE4(win[i:], wPos, minMatchLook)
-
-                       if n > length && (n > minMatchLength || pos-i <= 4096) {
-                               length = n
-                               offset = pos - i
-                               ok = true
-                               if n >= nice {
-                                       // The match is good enough that we don't try to find a better one.
-                                       break
-                               }
-                               wEnd = win[pos+n]
-                       }
-               }
-               if i == minIndex {
-                       // hashPrev[i & windowMask] has already been overwritten, so stop now.
-                       break
-               }
-               i = int(d.hashPrev[i&windowMask]) - d.hashOffset
+               i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
                if i < minIndex || i < 0 {
                        break
                }
@@ -364,293 +322,139 @@ func (d *compressor) writeStoredBlock(buf []byte) error {
        return d.w.err
 }
 
-const hashmul = 0x1e35a7bd
-
 // hash4 returns a hash representation of the first 4 bytes
 // of the supplied slice.
 // The caller must ensure that len(b) >= 4.
 func hash4(b []byte) uint32 {
-       return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits)
+       b = b[:4]
+       return hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits)
 }
 
 // bulkHash4 will compute hashes using the same
 // algorithm as hash4
 func bulkHash4(b []byte, dst []uint32) {
-       if len(b) < minMatchLength {
+       if len(b) < 4 {
                return
        }
        hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
-       dst[0] = (hb * hashmul) >> (32 - hashBits)
-       end := len(b) - minMatchLength + 1
+       dst[0] = hash4u(hb, hashBits)
+       end := len(b) - 4 + 1
        for i := 1; i < end; i++ {
                hb = (hb << 8) | uint32(b[i+3])
-               dst[i] = (hb * hashmul) >> (32 - hashBits)
+               dst[i] = hash4u(hb, hashBits)
        }
 }
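
hash4 and bulkHash4 above have to agree position by position, since bulkHash4 back-fills table entries for bytes that were skipped over by a match. A standalone sketch of that equivalence (constants copied from this file; the 17-bit table width is assumed here, as in compress/flate):

package main

import "fmt"

const (
	prime4bytes = 2654435761
	hashBits    = 17 // assumed table width for this sketch
)

func hash4u(u uint32, h uint8) uint32 {
	return (u * prime4bytes) >> ((32 - h) & 31)
}

func main() {
	b := []byte("deflate!")

	// Direct hash of b[0:4], as hash4 does.
	direct := hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits)

	// Rolling hashes of every 4-byte window, as bulkHash4 does.
	dst := make([]uint32, len(b)-3)
	hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
	dst[0] = hash4u(hb, hashBits)
	for i := 1; i < len(dst); i++ {
		hb = (hb << 8) | uint32(b[i+3]) // shift the oldest byte out, pull the next one in
		dst[i] = hash4u(hb, hashBits)
	}

	fmt.Println(direct == dst[0]) // true
}
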
 
-// matchLen returns the number of matching bytes in a and b
-// up to length 'max'. Both slices must be at least 'max'
-// bytes in size.
-func matchLen(a, b []byte, max int) int {
-       a = a[:max]
-       b = b[:len(a)]
-       for i, av := range a {
-               if b[i] != av {
-                       return i
-               }
-       }
-       return max
-}
-
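
The three-argument matchLen deleted above is replaced by a two-argument version (see the `matchLen(win[i:i+minMatchLook], wPos)` call in findMatch): the length cap is now expressed by slicing the first argument. The new implementation lives outside this hunk; a hypothetical sketch of such a prefix match, comparing 8 bytes at a time, looks like this:

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// matchLenSketch counts the common prefix of a and b. It is a stand-in for
// illustration, not the package's matchLen.
func matchLenSketch(a, b []byte) int {
	if len(b) < len(a) {
		a = a[:len(b)]
	}
	n := 0
	for len(a) >= 8 {
		if x := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); x != 0 {
			// First differing bit tells us how many whole bytes matched.
			return n + bits.TrailingZeros64(x)>>3
		}
		a, b = a[8:], b[8:]
		n += 8
	}
	for i := range a {
		if a[i] != b[i] {
			return n + i
		}
	}
	return n + len(a)
}

func main() {
	fmt.Println(matchLenSketch([]byte("flate stream data"), []byte("flate stream!"))) // prints 12
}
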
 func (d *compressor) initDeflate() {
        d.window = make([]byte, 2*windowSize)
-       d.hashOffset = 1
-       d.length = minMatchLength - 1
-       d.offset = 0
        d.byteAvailable = false
-       d.index = 0
-       d.hash = 0
-       d.chainHead = -1
-       d.bulkHasher = bulkHash4
-       if useSSE42 {
-               d.bulkHasher = crc32sseAll
-       }
-}
-
-// Assumes that d.fastSkipHashing != skipNever,
-// otherwise use deflateLazy
-func (d *compressor) deflate() {
-
-       // Sanity enables additional runtime tests.
-       // It's intended to be used during development
-       // to supplement the currently ad-hoc unit tests.
-       const sanity = false
-
-       if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
+       d.err = nil
+       if d.state == nil {
                return
        }
-
-       d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
-       if d.index < d.maxInsertIndex {
-               d.hash = hash4(d.window[d.index : d.index+minMatchLength])
-       }
-
-       for {
-               if sanity && d.index > d.windowEnd {
-                       panic("index > windowEnd")
-               }
-               lookahead := d.windowEnd - d.index
-               if lookahead < minMatchLength+maxMatchLength {
-                       if !d.sync {
-                               return
-                       }
-                       if sanity && d.index > d.windowEnd {
-                               panic("index > windowEnd")
-                       }
-                       if lookahead == 0 {
-                               if d.tokens.n > 0 {
-                                       if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
-                                               return
-                                       }
-                                       d.tokens.n = 0
-                               }
-                               return
-                       }
-               }
-               if d.index < d.maxInsertIndex {
-                       // Update the hash
-                       d.hash = hash4(d.window[d.index : d.index+minMatchLength])
-                       ch := d.hashHead[d.hash&hashMask]
-                       d.chainHead = int(ch)
-                       d.hashPrev[d.index&windowMask] = ch
-                       d.hashHead[d.hash&hashMask] = uint32(d.index + d.hashOffset)
-               }
-               d.length = minMatchLength - 1
-               d.offset = 0
-               minIndex := d.index - windowSize
-               if minIndex < 0 {
-                       minIndex = 0
-               }
-
-               if d.chainHead-d.hashOffset >= minIndex && lookahead > minMatchLength-1 {
-                       if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
-                               d.length = newLength
-                               d.offset = newOffset
-                       }
-               }
-               if d.length >= minMatchLength {
-                       d.ii = 0
-                       // There was a match at the previous step, and the current match is
-                       // not better. Output the previous match.
-                       // "d.length-3" should NOT be "d.length-minMatchLength", since the format always assume 3
-                       d.tokens.tokens[d.tokens.n] = matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize))
-                       d.tokens.n++
-                       // Insert in the hash table all strings up to the end of the match.
-                       // index and index-1 are already inserted. If there is not enough
-                       // lookahead, the last two strings are not inserted into the hash
-                       // table.
-                       if d.length <= d.fastSkipHashing {
-                               var newIndex int
-                               newIndex = d.index + d.length
-                               // Calculate missing hashes
-                               end := newIndex
-                               if end > d.maxInsertIndex {
-                                       end = d.maxInsertIndex
-                               }
-                               end += minMatchLength - 1
-                               startindex := d.index + 1
-                               if startindex > d.maxInsertIndex {
-                                       startindex = d.maxInsertIndex
-                               }
-                               tocheck := d.window[startindex:end]
-                               dstSize := len(tocheck) - minMatchLength + 1
-                               if dstSize > 0 {
-                                       dst := d.hashMatch[:dstSize]
-                                       bulkHash4(tocheck, dst)
-                                       var newH uint32
-                                       for i, val := range dst {
-                                               di := i + startindex
-                                               newH = val & hashMask
-                                               // Get previous value with the same hash.
-                                               // Our chain should point to the previous value.
-                                               d.hashPrev[di&windowMask] = d.hashHead[newH]
-                                               // Set the head of the hash chain to us.
-                                               d.hashHead[newH] = uint32(di + d.hashOffset)
-                                       }
-                                       d.hash = newH
-                               }
-                               d.index = newIndex
-                       } else {
-                               // For matches this long, we don't bother inserting each individual
-                               // item into the table.
-                               d.index += d.length
-                               if d.index < d.maxInsertIndex {
-                                       d.hash = hash4(d.window[d.index : d.index+minMatchLength])
-                               }
-                       }
-                       if d.tokens.n == maxFlateBlockTokens {
-                               // The block includes the current character
-                               if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
-                                       return
-                               }
-                               d.tokens.n = 0
-                       }
-               } else {
-                       d.ii++
-                       end := d.index + int(d.ii>>uint(d.fastSkipHashing)) + 1
-                       if end > d.windowEnd {
-                               end = d.windowEnd
-                       }
-                       for i := d.index; i < end; i++ {
-                               d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i]))
-                               d.tokens.n++
-                               if d.tokens.n == maxFlateBlockTokens {
-                                       if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil {
-                                               return
-                                       }
-                                       d.tokens.n = 0
-                               }
-                       }
-                       d.index = end
-               }
-       }
+       s := d.state
+       s.index = 0
+       s.hashOffset = 1
+       s.length = minMatchLength - 1
+       s.offset = 0
+       s.hash = 0
+       s.chainHead = -1
 }
 
 // deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever,
 // meaning it always has lazy matching on.
 func (d *compressor) deflateLazy() {
+       s := d.state
        // Sanity enables additional runtime tests.
        // It's intended to be used during development
        // to supplement the currently ad-hoc unit tests.
        const sanity = false
 
-       if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
+       if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync {
                return
        }
 
-       d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
-       if d.index < d.maxInsertIndex {
-               d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+       s.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
+       if s.index < s.maxInsertIndex {
+               s.hash = hash4(d.window[s.index : s.index+minMatchLength])
        }
 
        for {
-               if sanity && d.index > d.windowEnd {
+               if sanity && s.index > d.windowEnd {
                        panic("index > windowEnd")
                }
-               lookahead := d.windowEnd - d.index
+               lookahead := d.windowEnd - s.index
                if lookahead < minMatchLength+maxMatchLength {
                        if !d.sync {
                                return
                        }
-                       if sanity && d.index > d.windowEnd {
+                       if sanity && s.index > d.windowEnd {
                                panic("index > windowEnd")
                        }
                        if lookahead == 0 {
                                // Flush current output block if any.
                                if d.byteAvailable {
                                        // There is still one pending token that needs to be flushed
-                                       d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
-                                       d.tokens.n++
+                                       d.tokens.AddLiteral(d.window[s.index-1])
                                        d.byteAvailable = false
                                }
                                if d.tokens.n > 0 {
-                                       if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+                                       if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
                                                return
                                        }
-                                       d.tokens.n = 0
+                                       d.tokens.Reset()
                                }
                                return
                        }
                }
-               if d.index < d.maxInsertIndex {
+               if s.index < s.maxInsertIndex {
                        // Update the hash
-                       d.hash = hash4(d.window[d.index : d.index+minMatchLength])
-                       ch := d.hashHead[d.hash&hashMask]
-                       d.chainHead = int(ch)
-                       d.hashPrev[d.index&windowMask] = ch
-                       d.hashHead[d.hash&hashMask] = uint32(d.index + d.hashOffset)
-               }
-               prevLength := d.length
-               prevOffset := d.offset
-               d.length = minMatchLength - 1
-               d.offset = 0
-               minIndex := d.index - windowSize
+                       s.hash = hash4(d.window[s.index : s.index+minMatchLength])
+                       ch := s.hashHead[s.hash&hashMask]
+                       s.chainHead = int(ch)
+                       s.hashPrev[s.index&windowMask] = ch
+                       s.hashHead[s.hash&hashMask] = uint32(s.index + s.hashOffset)
+               }
+               prevLength := s.length
+               prevOffset := s.offset
+               s.length = minMatchLength - 1
+               s.offset = 0
+               minIndex := s.index - windowSize
                if minIndex < 0 {
                        minIndex = 0
                }
 
-               if d.chainHead-d.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
-                       if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
-                               d.length = newLength
-                               d.offset = newOffset
+               if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
+                       if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok {
+                               s.length = newLength
+                               s.offset = newOffset
                        }
                }
-               if prevLength >= minMatchLength && d.length <= prevLength {
+               if prevLength >= minMatchLength && s.length <= prevLength {
                        // There was a match at the previous step, and the current match is
                        // not better. Output the previous match.
-                       d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
-                       d.tokens.n++
+                       d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
 
                        // Insert in the hash table all strings up to the end of the match.
                        // index and index-1 are already inserted. If there is not enough
                        // lookahead, the last two strings are not inserted into the hash
                        // table.
                        var newIndex int
-                       newIndex = d.index + prevLength - 1
+                       newIndex = s.index + prevLength - 1
                        // Calculate missing hashes
                        end := newIndex
-                       if end > d.maxInsertIndex {
-                               end = d.maxInsertIndex
+                       if end > s.maxInsertIndex {
+                               end = s.maxInsertIndex
                        }
                        end += minMatchLength - 1
-                       startindex := d.index + 1
-                       if startindex > d.maxInsertIndex {
-                               startindex = d.maxInsertIndex
+                       startindex := s.index + 1
+                       if startindex > s.maxInsertIndex {
+                               startindex = s.maxInsertIndex
                        }
                        tocheck := d.window[startindex:end]
                        dstSize := len(tocheck) - minMatchLength + 1
                        if dstSize > 0 {
-                               dst := d.hashMatch[:dstSize]
+                               dst := s.hashMatch[:dstSize]
                                bulkHash4(tocheck, dst)
                                var newH uint32
                                for i, val := range dst {
@@ -658,390 +462,71 @@ func (d *compressor) deflateLazy() {
                                        newH = val & hashMask
                                        // Get previous value with the same hash.
                                        // Our chain should point to the previous value.
-                                       d.hashPrev[di&windowMask] = d.hashHead[newH]
-                                       // Set the head of the hash chain to us.
-                                       d.hashHead[newH] = uint32(di + d.hashOffset)
-                               }
-                               d.hash = newH
-                       }
-
-                       d.index = newIndex
-                       d.byteAvailable = false
-                       d.length = minMatchLength - 1
-                       if d.tokens.n == maxFlateBlockTokens {
-                               // The block includes the current character
-                               if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
-                                       return
-                               }
-                               d.tokens.n = 0
-                       }
-               } else {
-                       // Reset, if we got a match this run.
-                       if d.length >= minMatchLength {
-                               d.ii = 0
-                       }
-                       // We have a byte waiting. Emit it.
-                       if d.byteAvailable {
-                               d.ii++
-                               d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
-                               d.tokens.n++
-                               if d.tokens.n == maxFlateBlockTokens {
-                                       if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
-                                               return
-                                       }
-                                       d.tokens.n = 0
-                               }
-                               d.index++
-
-                               // If we have a long run of no matches, skip additional bytes
-                               // Resets when d.ii overflows after 64KB.
-                               if d.ii > 31 {
-                                       n := int(d.ii >> 5)
-                                       for j := 0; j < n; j++ {
-                                               if d.index >= d.windowEnd-1 {
-                                                       break
-                                               }
-
-                                               d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
-                                               d.tokens.n++
-                                               if d.tokens.n == maxFlateBlockTokens {
-                                                       if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
-                                                               return
-                                                       }
-                                                       d.tokens.n = 0
-                                               }
-                                               d.index++
-                                       }
-                                       // Flush last byte
-                                       d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
-                                       d.tokens.n++
-                                       d.byteAvailable = false
-                                       // d.length = minMatchLength - 1 // not needed, since d.ii is reset above, so it should never be > minMatchLength
-                                       if d.tokens.n == maxFlateBlockTokens {
-                                               if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
-                                                       return
-                                               }
-                                               d.tokens.n = 0
-                                       }
-                               }
-                       } else {
-                               d.index++
-                               d.byteAvailable = true
-                       }
-               }
-       }
-}
-
-// Assumes that d.fastSkipHashing != skipNever,
-// otherwise use deflateLazySSE
-func (d *compressor) deflateSSE() {
-
-       // Sanity enables additional runtime tests.
-       // It's intended to be used during development
-       // to supplement the currently ad-hoc unit tests.
-       const sanity = false
-
-       if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
-               return
-       }
-
-       d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
-       if d.index < d.maxInsertIndex {
-               d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
-       }
-
-       for {
-               if sanity && d.index > d.windowEnd {
-                       panic("index > windowEnd")
-               }
-               lookahead := d.windowEnd - d.index
-               if lookahead < minMatchLength+maxMatchLength {
-                       if !d.sync {
-                               return
-                       }
-                       if sanity && d.index > d.windowEnd {
-                               panic("index > windowEnd")
-                       }
-                       if lookahead == 0 {
-                               if d.tokens.n > 0 {
-                                       if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
-                                               return
-                                       }
-                                       d.tokens.n = 0
-                               }
-                               return
-                       }
-               }
-               if d.index < d.maxInsertIndex {
-                       // Update the hash
-                       d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
-                       ch := d.hashHead[d.hash]
-                       d.chainHead = int(ch)
-                       d.hashPrev[d.index&windowMask] = ch
-                       d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
-               }
-               d.length = minMatchLength - 1
-               d.offset = 0
-               minIndex := d.index - windowSize
-               if minIndex < 0 {
-                       minIndex = 0
-               }
-
-               if d.chainHead-d.hashOffset >= minIndex && lookahead > minMatchLength-1 {
-                       if newLength, newOffset, ok := d.findMatchSSE(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
-                               d.length = newLength
-                               d.offset = newOffset
-                       }
-               }
-               if d.length >= minMatchLength {
-                       d.ii = 0
-                       // There was a match at the previous step, and the current match is
-                       // not better. Output the previous match.
-                       // "d.length-3" should NOT be "d.length-minMatchLength", since the format always assume 3
-                       d.tokens.tokens[d.tokens.n] = matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize))
-                       d.tokens.n++
-                       // Insert in the hash table all strings up to the end of the match.
-                       // index and index-1 are already inserted. If there is not enough
-                       // lookahead, the last two strings are not inserted into the hash
-                       // table.
-                       if d.length <= d.fastSkipHashing {
-                               var newIndex int
-                               newIndex = d.index + d.length
-                               // Calculate missing hashes
-                               end := newIndex
-                               if end > d.maxInsertIndex {
-                                       end = d.maxInsertIndex
-                               }
-                               end += minMatchLength - 1
-                               startindex := d.index + 1
-                               if startindex > d.maxInsertIndex {
-                                       startindex = d.maxInsertIndex
-                               }
-                               tocheck := d.window[startindex:end]
-                               dstSize := len(tocheck) - minMatchLength + 1
-                               if dstSize > 0 {
-                                       dst := d.hashMatch[:dstSize]
-
-                                       crc32sseAll(tocheck, dst)
-                                       var newH uint32
-                                       for i, val := range dst {
-                                               di := i + startindex
-                                               newH = val & hashMask
-                                               // Get previous value with the same hash.
-                                               // Our chain should point to the previous value.
-                                               d.hashPrev[di&windowMask] = d.hashHead[newH]
-                                               // Set the head of the hash chain to us.
-                                               d.hashHead[newH] = uint32(di + d.hashOffset)
-                                       }
-                                       d.hash = newH
-                               }
-                               d.index = newIndex
-                       } else {
-                               // For matches this long, we don't bother inserting each individual
-                               // item into the table.
-                               d.index += d.length
-                               if d.index < d.maxInsertIndex {
-                                       d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
-                               }
-                       }
-                       if d.tokens.n == maxFlateBlockTokens {
-                               // The block includes the current character
-                               if d.err = d.writeBlockSkip(d.tokens, d.index, false); d.err != nil {
-                                       return
-                               }
-                               d.tokens.n = 0
-                       }
-               } else {
-                       d.ii++
-                       end := d.index + int(d.ii>>5) + 1
-                       if end > d.windowEnd {
-                               end = d.windowEnd
-                       }
-                       for i := d.index; i < end; i++ {
-                               d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[i]))
-                               d.tokens.n++
-                               if d.tokens.n == maxFlateBlockTokens {
-                                       if d.err = d.writeBlockSkip(d.tokens, i+1, false); d.err != nil {
-                                               return
-                                       }
-                                       d.tokens.n = 0
-                               }
-                       }
-                       d.index = end
-               }
-       }
-}
-
-// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever,
-// meaning it always has lazy matching on.
-func (d *compressor) deflateLazySSE() {
-       // Sanity enables additional runtime tests.
-       // It's intended to be used during development
-       // to supplement the currently ad-hoc unit tests.
-       const sanity = false
-
-       if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync {
-               return
-       }
-
-       d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
-       if d.index < d.maxInsertIndex {
-               d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
-       }
-
-       for {
-               if sanity && d.index > d.windowEnd {
-                       panic("index > windowEnd")
-               }
-               lookahead := d.windowEnd - d.index
-               if lookahead < minMatchLength+maxMatchLength {
-                       if !d.sync {
-                               return
-                       }
-                       if sanity && d.index > d.windowEnd {
-                               panic("index > windowEnd")
-                       }
-                       if lookahead == 0 {
-                               // Flush current output block if any.
-                               if d.byteAvailable {
-                                       // There is still one pending token that needs to be flushed
-                                       d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
-                                       d.tokens.n++
-                                       d.byteAvailable = false
-                               }
-                               if d.tokens.n > 0 {
-                                       if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
-                                               return
-                                       }
-                                       d.tokens.n = 0
-                               }
-                               return
-                       }
-               }
-               if d.index < d.maxInsertIndex {
-                       // Update the hash
-                       d.hash = crc32sse(d.window[d.index:d.index+minMatchLength]) & hashMask
-                       ch := d.hashHead[d.hash]
-                       d.chainHead = int(ch)
-                       d.hashPrev[d.index&windowMask] = ch
-                       d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
-               }
-               prevLength := d.length
-               prevOffset := d.offset
-               d.length = minMatchLength - 1
-               d.offset = 0
-               minIndex := d.index - windowSize
-               if minIndex < 0 {
-                       minIndex = 0
-               }
-
-               if d.chainHead-d.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
-                       if newLength, newOffset, ok := d.findMatchSSE(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok {
-                               d.length = newLength
-                               d.offset = newOffset
-                       }
-               }
-               if prevLength >= minMatchLength && d.length <= prevLength {
-                       // There was a match at the previous step, and the current match is
-                       // not better. Output the previous match.
-                       d.tokens.tokens[d.tokens.n] = matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
-                       d.tokens.n++
-
-                       // Insert in the hash table all strings up to the end of the match.
-                       // index and index-1 are already inserted. If there is not enough
-                       // lookahead, the last two strings are not inserted into the hash
-                       // table.
-                       var newIndex int
-                       newIndex = d.index + prevLength - 1
-                       // Calculate missing hashes
-                       end := newIndex
-                       if end > d.maxInsertIndex {
-                               end = d.maxInsertIndex
-                       }
-                       end += minMatchLength - 1
-                       startindex := d.index + 1
-                       if startindex > d.maxInsertIndex {
-                               startindex = d.maxInsertIndex
-                       }
-                       tocheck := d.window[startindex:end]
-                       dstSize := len(tocheck) - minMatchLength + 1
-                       if dstSize > 0 {
-                               dst := d.hashMatch[:dstSize]
-                               crc32sseAll(tocheck, dst)
-                               var newH uint32
-                               for i, val := range dst {
-                                       di := i + startindex
-                                       newH = val & hashMask
-                                       // Get previous value with the same hash.
-                                       // Our chain should point to the previous value.
-                                       d.hashPrev[di&windowMask] = d.hashHead[newH]
+                                       s.hashPrev[di&windowMask] = s.hashHead[newH]
                                        // Set the head of the hash chain to us.
-                                       d.hashHead[newH] = uint32(di + d.hashOffset)
+                                       s.hashHead[newH] = uint32(di + s.hashOffset)
                                }
-                               d.hash = newH
+                               s.hash = newH
                        }
 
-                       d.index = newIndex
+                       s.index = newIndex
                        d.byteAvailable = false
-                       d.length = minMatchLength - 1
+                       s.length = minMatchLength - 1
                        if d.tokens.n == maxFlateBlockTokens {
                                // The block includes the current character
-                               if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+                               if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
                                        return
                                }
-                               d.tokens.n = 0
+                               d.tokens.Reset()
                        }
                } else {
                        // Reset, if we got a match this run.
-                       if d.length >= minMatchLength {
-                               d.ii = 0
+                       if s.length >= minMatchLength {
+                               s.ii = 0
                        }
                        // We have a byte waiting. Emit it.
                        if d.byteAvailable {
-                               d.ii++
-                               d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
-                               d.tokens.n++
+                               s.ii++
+                               d.tokens.AddLiteral(d.window[s.index-1])
                                if d.tokens.n == maxFlateBlockTokens {
-                                       if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+                                       if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
                                                return
                                        }
-                                       d.tokens.n = 0
+                                       d.tokens.Reset()
                                }
-                               d.index++
+                               s.index++
 
                                // If we have a long run of no matches, skip additional bytes
-                               // Resets when d.ii overflows after 64KB.
-                               if d.ii > 31 {
-                                       n := int(d.ii >> 6)
+                               // Resets when s.ii overflows after 64KB.
+                               if s.ii > 31 {
+                                       n := int(s.ii >> 5)
                                        for j := 0; j < n; j++ {
-                                               if d.index >= d.windowEnd-1 {
+                                               if s.index >= d.windowEnd-1 {
                                                        break
                                                }
 
-                                               d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
-                                               d.tokens.n++
+                                               d.tokens.AddLiteral(d.window[s.index-1])
                                                if d.tokens.n == maxFlateBlockTokens {
-                                                       if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+                                                       if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
                                                                return
                                                        }
-                                                       d.tokens.n = 0
+                                                       d.tokens.Reset()
                                                }
-                                               d.index++
+                                               s.index++
                                        }
                                        // Flush last byte
-                                       d.tokens.tokens[d.tokens.n] = literalToken(uint32(d.window[d.index-1]))
-                                       d.tokens.n++
+                                       d.tokens.AddLiteral(d.window[s.index-1])
                                        d.byteAvailable = false
-                                       // d.length = minMatchLength - 1 // not needed, since d.ii is reset above, so it should never be > minMatchLength
+                                       // s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength
                                        if d.tokens.n == maxFlateBlockTokens {
-                                               if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+                                               if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
                                                        return
                                                }
-                                               d.tokens.n = 0
+                                               d.tokens.Reset()
                                        }
                                }
                        } else {
-                               d.index++
+                               s.index++
                                d.byteAvailable = true
                        }
                }
@@ -1070,17 +555,17 @@ func (d *compressor) storeHuff() {
        if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 {
                return
        }
-       d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+       d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync)
        d.err = d.w.err
        d.windowEnd = 0
 }
 
-// storeHuff will compress and store the currently added data,
+// storeFast will compress and store the currently added data,
 // if enough has been accumulated or we are at the end of the stream.
 // Any error that occurred will be in d.err
-func (d *compressor) storeSnappy() {
+func (d *compressor) storeFast() {
        // We only compress if we have maxStoreBlockSize.
-       if d.windowEnd < maxStoreBlockSize {
+       if d.windowEnd < len(d.window) {
                if !d.sync {
                        return
                }
@@ -1091,32 +576,30 @@ func (d *compressor) storeSnappy() {
                        }
                        if d.windowEnd <= 32 {
                                d.err = d.writeStoredBlock(d.window[:d.windowEnd])
-                               d.tokens.n = 0
-                               d.windowEnd = 0
                        } else {
-                               d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+                               d.w.writeBlockHuff(false, d.window[:d.windowEnd], true)
                                d.err = d.w.err
                        }
-                       d.tokens.n = 0
+                       d.tokens.Reset()
                        d.windowEnd = 0
-                       d.snap.Reset()
+                       d.fast.Reset()
                        return
                }
        }
 
-       d.snap.Encode(&d.tokens, d.window[:d.windowEnd])
+       d.fast.Encode(&d.tokens, d.window[:d.windowEnd])
        // If we made zero matches, store the block as is.
-       if int(d.tokens.n) == d.windowEnd {
+       if d.tokens.n == 0 {
                d.err = d.writeStoredBlock(d.window[:d.windowEnd])
                // If we removed less than 1/16th, huffman compress the block.
        } else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) {
-               d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+               d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync)
                d.err = d.w.err
        } else {
-               d.w.writeBlockDynamic(d.tokens.tokens[:d.tokens.n], false, d.window[:d.windowEnd])
+               d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync)
                d.err = d.w.err
        }
-       d.tokens.n = 0
+       d.tokens.Reset()
        d.windowEnd = 0
 }
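
The three branches at the end of storeFast encode a simple size heuristic: store raw if the encoder found no matches, huffman-only if matching removed less than 1/16th of the input, otherwise a dynamic huffman block. A back-of-the-envelope sketch, assuming a full window of maxStoreBlockSize (65535 in flate) bytes and made-up token counts:

package main

import "fmt"

func main() {
	windowEnd := 65535
	threshold := windowEnd - windowEnd>>4 // 61440: above this, matches removed < 1/16th of the input

	for _, tokens := range []int{0, 63000, 40000} {
		switch {
		case tokens == 0:
			fmt.Printf("%5d tokens: stored block (no matches found)\n", tokens)
		case tokens > threshold:
			fmt.Printf("%5d tokens: huffman-only block\n", tokens)
		default:
			fmt.Printf("%5d tokens: dynamic huffman block\n", tokens)
		}
	}
}
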
 
@@ -1161,35 +644,26 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
                d.fill = (*compressor).fillBlock
                d.step = (*compressor).store
        case level == ConstantCompression:
+               d.w.logReusePenalty = uint(4)
                d.window = make([]byte, maxStoreBlockSize)
                d.fill = (*compressor).fillBlock
                d.step = (*compressor).storeHuff
-       case level >= 1 && level <= 4:
-               d.snap = newSnappy(level)
-               d.window = make([]byte, maxStoreBlockSize)
-               d.fill = (*compressor).fillBlock
-               d.step = (*compressor).storeSnappy
        case level == DefaultCompression:
                level = 5
                fallthrough
-       case 5 <= level && level <= 9:
+       case level >= 1 && level <= 6:
+               d.w.logReusePenalty = uint(level + 1)
+               d.fast = newFastEnc(level)
+               d.window = make([]byte, maxStoreBlockSize)
+               d.fill = (*compressor).fillBlock
+               d.step = (*compressor).storeFast
+       case 7 <= level && level <= 9:
+               d.w.logReusePenalty = uint(level)
+               d.state = &advancedState{}
                d.compressionLevel = levels[level]
                d.initDeflate()
                d.fill = (*compressor).fillDeflate
-               if d.fastSkipHashing == skipNever {
-                       if useSSE42 {
-                               d.step = (*compressor).deflateLazySSE
-                       } else {
-                               d.step = (*compressor).deflateLazy
-                       }
-               } else {
-                       if useSSE42 {
-                               d.step = (*compressor).deflateSSE
-                       } else {
-                               d.step = (*compressor).deflate
-
-                       }
-               }
+               d.step = (*compressor).deflateLazy
        default:
                return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level)
        }
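
For callers the level routing above is internal: the writer is constructed exactly as before. A small usage sketch against the vendored package (compressed sizes will vary with the input):

package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/flate"
)

func main() {
	payload := bytes.Repeat([]byte("gitea gzip middleware "), 512)

	// -2 is ConstantCompression (huffman only); per the switch above,
	// levels 1-6 use the table-based fast encoders and 7-9 the lazy matcher.
	for _, level := range []int{-2, 1, 6, 9} {
		var buf bytes.Buffer
		w, err := flate.NewWriter(&buf, level)
		if err != nil {
			panic(err)
		}
		if _, err := w.Write(payload); err != nil {
			panic(err)
		}
		if err := w.Close(); err != nil {
			panic(err)
		}
		fmt.Printf("level %2d: %d -> %d bytes\n", level, len(payload), buf.Len())
	}
}
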
@@ -1202,10 +676,10 @@ func (d *compressor) reset(w io.Writer) {
        d.sync = false
        d.err = nil
        // We only need to reset a few things for Snappy.
-       if d.snap != nil {
-               d.snap.Reset()
+       if d.fast != nil {
+               d.fast.Reset()
                d.windowEnd = 0
-               d.tokens.n = 0
+               d.tokens.Reset()
                return
        }
        switch d.compressionLevel.chain {
@@ -1213,22 +687,23 @@ func (d *compressor) reset(w io.Writer) {
                // level was NoCompression or ConstantCompression.
                d.windowEnd = 0
        default:
-               d.chainHead = -1
-               for i := range d.hashHead {
-                       d.hashHead[i] = 0
+               s := d.state
+               s.chainHead = -1
+               for i := range s.hashHead {
+                       s.hashHead[i] = 0
                }
-               for i := range d.hashPrev {
-                       d.hashPrev[i] = 0
+               for i := range s.hashPrev {
+                       s.hashPrev[i] = 0
                }
-               d.hashOffset = 1
-               d.index, d.windowEnd = 0, 0
+               s.hashOffset = 1
+               s.index, d.windowEnd = 0, 0
                d.blockStart, d.byteAvailable = 0, false
-               d.tokens.n = 0
-               d.length = minMatchLength - 1
-               d.offset = 0
-               d.hash = 0
-               d.ii = 0
-               d.maxInsertIndex = 0
+               d.tokens.Reset()
+               s.length = minMatchLength - 1
+               s.offset = 0
+               s.hash = 0
+               s.ii = 0
+               s.maxInsertIndex = 0
        }
 }
 
diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
new file mode 100644 (file)
index 0000000..b0a470f
--- /dev/null
@@ -0,0 +1,257 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Modified for deflate by Klaus Post (c) 2015.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+       "fmt"
+       "math/bits"
+)
+
+type fastEnc interface {
+       Encode(dst *tokens, src []byte)
+       Reset()
+}
+
+func newFastEnc(level int) fastEnc {
+       switch level {
+       case 1:
+               return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}}
+       case 2:
+               return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}}
+       case 3:
+               return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}}
+       case 4:
+               return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}}
+       case 5:
+               return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}}
+       case 6:
+               return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}}
+       default:
+               panic("invalid level specified")
+       }
+}
+
+const (
+       tableBits       = 16             // Bits used in the table
+       tableSize       = 1 << tableBits // Size of the table
+       tableShift      = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
+       baseMatchOffset = 1              // The smallest match offset
+       baseMatchLength = 3              // The smallest match length per the RFC section 3.2.5
+       maxMatchOffset  = 1 << 15        // The largest match offset
+
+       bTableBits   = 18                                           // Bits used in the big tables
+       bTableSize   = 1 << bTableBits                              // Size of the table
+       allocHistory = maxMatchOffset * 10                          // Size to preallocate for history.
+       bufferReset  = (1 << 31) - allocHistory - maxStoreBlockSize // Reset the buffer offset when reaching this.
+)
+
+const (
+       prime3bytes = 506832829
+       prime4bytes = 2654435761
+       prime5bytes = 889523592379
+       prime6bytes = 227718039650203
+       prime7bytes = 58295818150454627
+       prime8bytes = 0xcf1bbcdcb7a56463
+)
+
+func load32(b []byte, i int) uint32 {
+       // Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+       b = b[i:]
+       b = b[:4]
+       return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+       // Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+       b = b[i:]
+       b = b[:8]
+       return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+               uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func load3232(b []byte, i int32) uint32 {
+       // Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+       b = b[i:]
+       b = b[:4]
+       return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load6432(b []byte, i int32) uint64 {
+       // Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+       b = b[i:]
+       b = b[:8]
+       return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+               uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
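
The double re-slicing in load32/load64/load3232/load6432 above is the usual bounds-check-elimination hint: after `b = b[:4]` the compiler knows the slice length exactly, so the four indexed reads need no further checks. A standalone copy to experiment with (eliminated checks can be listed with `go build -gcflags=-d=ssa/check_bce/debug=1`):

package main

import "fmt"

func load32(b []byte, i int) uint32 {
	b = b[i:]
	b = b[:4] // one bounds check here; b[0]..b[3] below need none
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

func main() {
	data := []byte{0x78, 0x9c, 0x01, 0x00, 0xff}
	fmt.Printf("%#x\n", load32(data, 1))
}
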
+
+func hash(u uint32) uint32 {
+       return (u * 0x1e35a7bd) >> tableShift
+}
+
+type tableEntry struct {
+       val    uint32
+       offset int32
+}
+
+// fastGen maintains the table for matches,
+// and the previous byte block for level 2.
+// This is the generic implementation.
+type fastGen struct {
+       hist []byte
+       cur  int32
+}
+
+func (e *fastGen) addBlock(src []byte) int32 {
+       // check if we have space already
+       if len(e.hist)+len(src) > cap(e.hist) {
+               if cap(e.hist) == 0 {
+                       e.hist = make([]byte, 0, allocHistory)
+               } else {
+                       if cap(e.hist) < maxMatchOffset*2 {
+                               panic("unexpected buffer size")
+                       }
+                       // Move down
+                       offset := int32(len(e.hist)) - maxMatchOffset
+                       copy(e.hist[0:maxMatchOffset], e.hist[offset:])
+                       e.cur += offset
+                       e.hist = e.hist[:maxMatchOffset]
+               }
+       }
+       s := int32(len(e.hist))
+       e.hist = append(e.hist, src...)
+       return s
+}
+
+// hash4u returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4u(u uint32, h uint8) uint32 {
+       return (u * prime4bytes) >> ((32 - h) & 31)
+}
+
+type tableEntryPrev struct {
+       Cur  tableEntry
+       Prev tableEntry
+}
+
+// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4x64(u uint64, h uint8) uint32 {
+       return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
+}
+
+// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash7(u uint64, h uint8) uint32 {
+       return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
+}
+
+// hash8 returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash8(u uint64, h uint8) uint32 {
+       return uint32((u * prime8bytes) >> ((64 - h) & 63))
+}
+
+// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash6(u uint64, h uint8) uint32 {
+       return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
+}
+
+// matchlen will return the match length between offsets s and t in src.
+// The maximum length returned is maxMatchLength - 4.
+// It is assumed that s > t, that t >= 0 and s < len(src).
+func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
+       if debugDecode {
+               if t >= s {
+                       panic(fmt.Sprint("t >=s:", t, s))
+               }
+               if int(s) >= len(src) {
+                       panic(fmt.Sprint("s >= len(src):", s, len(src)))
+               }
+               if t < 0 {
+                       panic(fmt.Sprint("t < 0:", t))
+               }
+               if s-t > maxMatchOffset {
+                       panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
+               }
+       }
+       s1 := int(s) + maxMatchLength - 4
+       if s1 > len(src) {
+               s1 = len(src)
+       }
+
+       // Extend the match to be as long as possible.
+       return int32(matchLen(src[s:s1], src[t:]))
+}
+
+// matchlenLong will return the match length between offsets s and t in src.
+// It is assumed that s > t, that t >= 0 and s < len(src).
+func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
+       if debugDecode {
+               if t >= s {
+                       panic(fmt.Sprint("t >=s:", t, s))
+               }
+               if int(s) >= len(src) {
+                       panic(fmt.Sprint("s >= len(src):", s, len(src)))
+               }
+               if t < 0 {
+                       panic(fmt.Sprint("t < 0:", t))
+               }
+               if s-t > maxMatchOffset {
+                       panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
+               }
+       }
+       // Extend the match to be as long as possible.
+       return int32(matchLen(src[s:], src[t:]))
+}
+
+// Reset the encoding table.
+func (e *fastGen) Reset() {
+       if cap(e.hist) < int(maxMatchOffset*8) {
+               l := maxMatchOffset * 8
+               // Make it at least 1MB.
+               if l < 1<<20 {
+                       l = 1 << 20
+               }
+               e.hist = make([]byte, 0, l)
+       }
+       // We offset current position so everything will be out of reach
+       e.cur += maxMatchOffset + int32(len(e.hist))
+       e.hist = e.hist[:0]
+}
+
+// matchLen returns the number of matching leading bytes of a and b.
+// 'a' must be the shorter of the two.
+func matchLen(a, b []byte) int {
+       b = b[:len(a)]
+       var checked int
+       if len(a) > 4 {
+               // Try 4 bytes first
+               if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
+                       return bits.TrailingZeros32(diff) >> 3
+               }
+               // Switch to 8 byte matching.
+               checked = 4
+               a = a[4:]
+               b = b[4:]
+               for len(a) >= 8 {
+                       b = b[:len(a)]
+                       if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
+                               return checked + (bits.TrailingZeros64(diff) >> 3)
+                       }
+                       checked += 8
+                       a = a[8:]
+                       b = b[8:]
+               }
+       }
+       b = b[:len(a)]
+       for i := range a {
+               if a[i] != b[i] {
+                       return int(i) + checked
+               }
+       }
+       return len(a) + checked
+}
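
The hash4u/hash6/hash7/hash8 helpers above all follow the same multiplicative-hash pattern: keep only the low n bytes of a 64-bit little-endian load, multiply by a large prime, and take the top h bits as the table index. Below is a minimal stand-alone sketch of that idea, reusing the prime6bytes constant from this file; the 15-bit table width and the sample input are illustrative assumptions, not taken from the diff.

package main

import (
	"encoding/binary"
	"fmt"
)

// prime6bytes mirrors the constant defined in fast_encoder.go above.
const prime6bytes = 227718039650203

// hash6 maps the lowest 6 bytes of u to an h-bit table index:
// shift the unused high bytes away, multiply by a large prime,
// and keep the top h bits of the product.
func hash6(u uint64, h uint8) uint32 {
	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
}

func main() {
	data := []byte("an example input buffer!")
	u := binary.LittleEndian.Uint64(data) // same byte order as load6432
	fmt.Printf("table index %d of %d\n", hash6(u, 15), 1<<15)
}
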
index f9b2a699a3ddabe31ffbeaf441c76b32c2864b66..5ed476aa0dcb7d88105dd08343995a968d29374e 100644 (file)
@@ -35,7 +35,7 @@ const (
 )
 
 // The number of extra bits needed by length code X - LENGTH_CODES_START.
-var lengthExtraBits = []int8{
+var lengthExtraBits = [32]int8{
        /* 257 */ 0, 0, 0,
        /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
        /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
@@ -43,14 +43,14 @@ var lengthExtraBits = []int8{
 }
 
 // The length indicated by length code X - LENGTH_CODES_START.
-var lengthBase = []uint32{
+var lengthBase = [32]uint8{
        0, 1, 2, 3, 4, 5, 6, 7, 8, 10,
        12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
        64, 80, 96, 112, 128, 160, 192, 224, 255,
 }
 
 // offset code word extra bits.
-var offsetExtraBits = []int8{
+var offsetExtraBits = [64]int8{
        0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
        4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
        9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
@@ -58,7 +58,7 @@ var offsetExtraBits = []int8{
        14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
 }
 
-var offsetBase = []uint32{
+var offsetBase = [64]uint32{
        /* normal deflate */
        0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
        0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
@@ -85,26 +85,48 @@ type huffmanBitWriter struct {
        // Data waiting to be written is bytes[0:nbytes]
        // and then the low nbits of bits.
        bits            uint64
-       nbits           uint
-       bytes           [bufferSize]byte
-       codegenFreq     [codegenCodeCount]int32
-       nbytes          int
-       literalFreq     []int32
-       offsetFreq      []int32
-       codegen         []uint8
+       nbits           uint16
+       nbytes          uint8
        literalEncoding *huffmanEncoder
        offsetEncoding  *huffmanEncoder
        codegenEncoding *huffmanEncoder
        err             error
+       lastHeader      int
+       // logReusePenalty is the penalty (as a right shift) added to the estimated
+       // size of a fresh table; at 0 a reused block may be up to 2x the optimal size.
+       logReusePenalty uint
+       lastHuffMan     bool
+       bytes           [256]byte
+       literalFreq     [lengthCodesStart + 32]uint16
+       offsetFreq      [32]uint16
+       codegenFreq     [codegenCodeCount]uint16
+
+       // codegen must have an extra space for the final symbol.
+       codegen [literalCount + offsetCodeCount + 1]uint8
 }
 
+// Huffman reuse.
+//
+// The huffmanBitWriter supports reusing huffman tables and thereby combining block sections.
+//
+// This is controlled by several variables:
+//
+// If lastHeader is non-zero the Huffman table can be reused.
+// This also indicates that a Huffman table has been generated that can output all
+// possible symbols.
+// It also indicates that an EOB has not yet been emitted, so if a new table is generated
+// an EOB with the previous table must be written.
+//
+// If lastHuffMan is set, a table for outputting literals has been generated and offsets are invalid.
+//
+// An incoming block estimates the output size of a fresh table by calculating the
+// optimal size and adding a penalty in 'logReusePenalty'.
+// A generated Huffman table is never perfectly optimal, which is why we add a penalty,
+// and generating a new table is also slower for both compression and decompression.
+
 func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
        return &huffmanBitWriter{
                writer:          w,
-               literalFreq:     make([]int32, maxNumLit),
-               offsetFreq:      make([]int32, offsetCodeCount),
-               codegen:         make([]uint8, maxNumLit+offsetCodeCount+1),
-               literalEncoding: newHuffmanEncoder(maxNumLit),
+               literalEncoding: newHuffmanEncoder(literalCount),
                codegenEncoding: newHuffmanEncoder(codegenCodeCount),
                offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
        }
@@ -113,7 +135,42 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
 func (w *huffmanBitWriter) reset(writer io.Writer) {
        w.writer = writer
        w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil
-       w.bytes = [bufferSize]byte{}
+       w.bytes = [256]byte{}
+       w.lastHeader = 0
+       w.lastHuffMan = false
+}
+
+func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) {
+       offsets, lits = true, true
+       a := t.offHist[:offsetCodeCount]
+       b := w.offsetFreq[:len(a)]
+       for i := range a {
+               if b[i] == 0 && a[i] != 0 {
+                       offsets = false
+                       break
+               }
+       }
+
+       a = t.extraHist[:literalCount-256]
+       b = w.literalFreq[256:literalCount]
+       b = b[:len(a)]
+       for i := range a {
+               if b[i] == 0 && a[i] != 0 {
+                       lits = false
+                       break
+               }
+       }
+       if lits {
+               a = t.litHist[:]
+               b = w.literalFreq[:len(a)]
+               for i := range a {
+                       if b[i] == 0 && a[i] != 0 {
+                               lits = false
+                               break
+                       }
+               }
+       }
+       return
 }
 
 func (w *huffmanBitWriter) flush() {
@@ -144,30 +201,11 @@ func (w *huffmanBitWriter) write(b []byte) {
        _, w.err = w.writer.Write(b)
 }
 
-func (w *huffmanBitWriter) writeBits(b int32, nb uint) {
-       if w.err != nil {
-               return
-       }
-       w.bits |= uint64(b) << w.nbits
+func (w *huffmanBitWriter) writeBits(b int32, nb uint16) {
+       w.bits |= uint64(b) << (w.nbits & 63)
        w.nbits += nb
        if w.nbits >= 48 {
-               bits := w.bits
-               w.bits >>= 48
-               w.nbits -= 48
-               n := w.nbytes
-               bytes := w.bytes[n : n+6]
-               bytes[0] = byte(bits)
-               bytes[1] = byte(bits >> 8)
-               bytes[2] = byte(bits >> 16)
-               bytes[3] = byte(bits >> 24)
-               bytes[4] = byte(bits >> 32)
-               bytes[5] = byte(bits >> 40)
-               n += 6
-               if n >= bufferFlushSize {
-                       w.write(w.bytes[:n])
-                       n = 0
-               }
-               w.nbytes = n
+               w.writeOutBits()
        }
 }
 
@@ -213,7 +251,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE
        // a copy of the frequencies, and as the place where we put the result.
        // This is fine because the output is always shorter than the input used
        // so far.
-       codegen := w.codegen // cache
+       codegen := w.codegen[:] // cache
        // Copy the concatenated code sizes to codegen. Put a marker at the end.
        cgnl := codegen[:numLiterals]
        for i := range cgnl {
@@ -292,30 +330,54 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE
        codegen[outIndex] = badCode
 }
 
-// dynamicSize returns the size of dynamically encoded data in bits.
-func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) {
+func (w *huffmanBitWriter) codegens() int {
+       numCodegens := len(w.codegenFreq)
+       for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
+               numCodegens--
+       }
+       return numCodegens
+}
+
+func (w *huffmanBitWriter) headerSize() (size, numCodegens int) {
        numCodegens = len(w.codegenFreq)
        for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
                numCodegens--
        }
-       header := 3 + 5 + 5 + 4 + (3 * numCodegens) +
+       return 3 + 5 + 5 + 4 + (3 * numCodegens) +
                w.codegenEncoding.bitLength(w.codegenFreq[:]) +
                int(w.codegenFreq[16])*2 +
                int(w.codegenFreq[17])*3 +
-               int(w.codegenFreq[18])*7
+               int(w.codegenFreq[18])*7, numCodegens
+}
+
+// dynamicSize returns the size of dynamically encoded data in bits.
+func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) {
+       header, numCodegens := w.headerSize()
        size = header +
-               litEnc.bitLength(w.literalFreq) +
-               offEnc.bitLength(w.offsetFreq) +
+               litEnc.bitLength(w.literalFreq[:]) +
+               offEnc.bitLength(w.offsetFreq[:]) +
                extraBits
-
        return size, numCodegens
 }
 
+// extraBitSize will return the number of bits that will be written
+// as "extra" bits on matches.
+func (w *huffmanBitWriter) extraBitSize() int {
+       total := 0
+       for i, n := range w.literalFreq[257:literalCount] {
+               total += int(n) * int(lengthExtraBits[i&31])
+       }
+       for i, n := range w.offsetFreq[:offsetCodeCount] {
+               total += int(n) * int(offsetExtraBits[i&31])
+       }
+       return total
+}
+
 // fixedSize returns the size of fixed Huffman encoded data in bits.
 func (w *huffmanBitWriter) fixedSize(extraBits int) int {
        return 3 +
-               fixedLiteralEncoding.bitLength(w.literalFreq) +
-               fixedOffsetEncoding.bitLength(w.offsetFreq) +
+               fixedLiteralEncoding.bitLength(w.literalFreq[:]) +
+               fixedOffsetEncoding.bitLength(w.offsetFreq[:]) +
                extraBits
 }
 
@@ -333,30 +395,36 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
 }
 
 func (w *huffmanBitWriter) writeCode(c hcode) {
-       if w.err != nil {
-               return
-       }
+       // The function does not get inlined if we "& 63" the shift.
        w.bits |= uint64(c.code) << w.nbits
-       w.nbits += uint(c.len)
+       w.nbits += c.len
        if w.nbits >= 48 {
-               bits := w.bits
-               w.bits >>= 48
-               w.nbits -= 48
-               n := w.nbytes
-               bytes := w.bytes[n : n+6]
-               bytes[0] = byte(bits)
-               bytes[1] = byte(bits >> 8)
-               bytes[2] = byte(bits >> 16)
-               bytes[3] = byte(bits >> 24)
-               bytes[4] = byte(bits >> 32)
-               bytes[5] = byte(bits >> 40)
-               n += 6
-               if n >= bufferFlushSize {
-                       w.write(w.bytes[:n])
+               w.writeOutBits()
+       }
+}
+
+// writeOutBits will write bits to the buffer.
+func (w *huffmanBitWriter) writeOutBits() {
+       bits := w.bits
+       w.bits >>= 48
+       w.nbits -= 48
+       n := w.nbytes
+       w.bytes[n] = byte(bits)
+       w.bytes[n+1] = byte(bits >> 8)
+       w.bytes[n+2] = byte(bits >> 16)
+       w.bytes[n+3] = byte(bits >> 24)
+       w.bytes[n+4] = byte(bits >> 32)
+       w.bytes[n+5] = byte(bits >> 40)
+       n += 6
+       if n >= bufferFlushSize {
+               if w.err != nil {
                        n = 0
+                       return
                }
-               w.nbytes = n
+               w.write(w.bytes[:n])
+               n = 0
        }
+       w.nbytes = n
 }
 
 // Write the header of a dynamic Huffman block to the output stream.
@@ -412,6 +480,11 @@ func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
        if w.err != nil {
                return
        }
+       if w.lastHeader > 0 {
+               // We owe an EOB
+               w.writeCode(w.literalEncoding.codes[endBlockMarker])
+               w.lastHeader = 0
+       }
        var flag int32
        if isEof {
                flag = 1
@@ -426,6 +499,12 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
        if w.err != nil {
                return
        }
+       if w.lastHeader > 0 {
+               // We owe an EOB
+               w.writeCode(w.literalEncoding.codes[endBlockMarker])
+               w.lastHeader = 0
+       }
+
        // Indicate that we are a fixed Huffman block
        var value int32 = 2
        if isEof {
@@ -439,29 +518,23 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
 // is larger than the original bytes, the data will be written as a
 // stored block.
 // If the input is nil, the tokens will always be Huffman encoded.
-func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
+func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
        if w.err != nil {
                return
        }
 
-       tokens = append(tokens, endBlockMarker)
-       numLiterals, numOffsets := w.indexTokens(tokens)
-
+       tokens.AddEOB()
+       if w.lastHeader > 0 {
+               // We owe an EOB
+               w.writeCode(w.literalEncoding.codes[endBlockMarker])
+               w.lastHeader = 0
+       }
+       numLiterals, numOffsets := w.indexTokens(tokens, false)
+       w.generate(tokens)
        var extraBits int
        storedSize, storable := w.storedSize(input)
        if storable {
-               // We only bother calculating the costs of the extra bits required by
-               // the length of offset fields (which will be the same for both fixed
-               // and dynamic encoding), if we need to compare those two encodings
-               // against stored encoding.
-               for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ {
-                       // First eight length codes have extra size = 0.
-                       extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart])
-               }
-               for offsetCode := 4; offsetCode < numOffsets; offsetCode++ {
-                       // First four offset codes have extra size = 0.
-                       extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode])
-               }
+               extraBits = w.extraBitSize()
        }
 
        // Figure out smallest code.
@@ -500,7 +573,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
        }
 
        // Write the tokens.
-       w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes)
+       w.writeTokens(tokens.Slice(), literalEncoding.codes, offsetEncoding.codes)
 }
 
 // writeBlockDynamic encodes a block using a dynamic Huffman table.
@@ -508,57 +581,103 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
 // histogram distribution.
 // If input is supplied and the compression savings are below 1/16th of the
 // input size the block is stored.
-func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) {
+func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []byte, sync bool) {
        if w.err != nil {
                return
        }
 
-       tokens = append(tokens, endBlockMarker)
-       numLiterals, numOffsets := w.indexTokens(tokens)
+       sync = sync || eof
+       if sync {
+               tokens.AddEOB()
+       }
 
-       // Generate codegen and codegenFrequencies, which indicates how to encode
-       // the literalEncoding and the offsetEncoding.
-       w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
-       w.codegenEncoding.generate(w.codegenFreq[:], 7)
-       size, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, 0)
+       // We cannot reuse a pure Huffman table.
+       if w.lastHuffMan && w.lastHeader > 0 {
+               // We will not try to reuse.
+               w.writeCode(w.literalEncoding.codes[endBlockMarker])
+               w.lastHeader = 0
+               w.lastHuffMan = false
+       }
+       if !sync {
+               tokens.Fill()
+       }
+       numLiterals, numOffsets := w.indexTokens(tokens, !sync)
 
-       // Store bytes, if we don't get a reasonable improvement.
-       if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
-               w.writeStoredHeader(len(input), eof)
-               w.writeBytes(input)
-               return
+       var size int
+       // Check if we should reuse.
+       if w.lastHeader > 0 {
+               // Estimate size for using a new table
+               newSize := w.lastHeader + tokens.EstimatedBits()
+
+               // The estimated size is calculated as an optimal table.
+               // We add a penalty to make it more realistic and re-use a bit more.
+               newSize += newSize >> (w.logReusePenalty & 31)
+               extra := w.extraBitSize()
+               reuseSize, _ := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extra)
+
+               // Check if a new table is better.
+               if newSize < reuseSize {
+                       // Write the EOB we owe.
+                       w.writeCode(w.literalEncoding.codes[endBlockMarker])
+                       size = newSize
+                       w.lastHeader = 0
+               } else {
+                       size = reuseSize
+               }
+               // Check if we get a reasonable size decrease.
+               if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+                       w.writeStoredHeader(len(input), eof)
+                       w.writeBytes(input)
+                       w.lastHeader = 0
+                       return
+               }
        }
 
-       // Write Huffman table.
-       w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+       // We want a new block/table
+       if w.lastHeader == 0 {
+               w.generate(tokens)
+               // Generate codegen and codegenFrequencies, which indicates how to encode
+               // the literalEncoding and the offsetEncoding.
+               w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
+               w.codegenEncoding.generate(w.codegenFreq[:], 7)
+               var numCodegens int
+               size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize())
+               // Store bytes, if we don't get a reasonable improvement.
+               if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+                       w.writeStoredHeader(len(input), eof)
+                       w.writeBytes(input)
+                       w.lastHeader = 0
+                       return
+               }
 
+               // Write Huffman table.
+               w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+               w.lastHeader, _ = w.headerSize()
+               w.lastHuffMan = false
+       }
+
+       if sync {
+               w.lastHeader = 0
+       }
        // Write the tokens.
-       w.writeTokens(tokens, w.literalEncoding.codes, w.offsetEncoding.codes)
+       w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes)
 }
 
 // indexTokens indexes a slice of tokens, and updates
 // literalFreq and offsetFreq, and generates literalEncoding
 // and offsetEncoding.
 // The number of literal and offset tokens is returned.
-func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets int) {
-       for i := range w.literalFreq {
-               w.literalFreq[i] = 0
-       }
-       for i := range w.offsetFreq {
-               w.offsetFreq[i] = 0
-       }
+func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) {
+       copy(w.literalFreq[:], t.litHist[:])
+       copy(w.literalFreq[256:], t.extraHist[:])
+       copy(w.offsetFreq[:], t.offHist[:offsetCodeCount])
 
-       for _, t := range tokens {
-               if t < matchType {
-                       w.literalFreq[t.literal()]++
-                       continue
-               }
-               length := t.length()
-               offset := t.offset()
-               w.literalFreq[lengthCodesStart+lengthCode(length)]++
-               w.offsetFreq[offsetCode(offset)]++
+       if t.n == 0 {
+               return
+       }
+       if filled {
+               return maxNumLit, maxNumDist
        }
-
        // get the number of literals
        numLiterals = len(w.literalFreq)
        for w.literalFreq[numLiterals-1] == 0 {
@@ -575,41 +694,85 @@ func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets
                w.offsetFreq[0] = 1
                numOffsets = 1
        }
-       w.literalEncoding.generate(w.literalFreq, 15)
-       w.offsetEncoding.generate(w.offsetFreq, 15)
        return
 }
 
+func (w *huffmanBitWriter) generate(t *tokens) {
+       w.literalEncoding.generate(w.literalFreq[:literalCount], 15)
+       w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15)
+}
+
 // writeTokens writes a slice of tokens to the output.
 // codes for literal and offset encoding must be supplied.
 func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) {
        if w.err != nil {
                return
        }
+       if len(tokens) == 0 {
+               return
+       }
+
+       // Only last token should be endBlockMarker.
+       var deferEOB bool
+       if tokens[len(tokens)-1] == endBlockMarker {
+               tokens = tokens[:len(tokens)-1]
+               deferEOB = true
+       }
+
+       // Create slices up to the next power of two to avoid bounds checks.
+       lits := leCodes[:256]
+       offs := oeCodes[:32]
+       lengths := leCodes[lengthCodesStart:]
+       lengths = lengths[:32]
        for _, t := range tokens {
                if t < matchType {
-                       w.writeCode(leCodes[t.literal()])
+                       w.writeCode(lits[t.literal()])
                        continue
                }
+
                // Write the length
                length := t.length()
                lengthCode := lengthCode(length)
-               w.writeCode(leCodes[lengthCode+lengthCodesStart])
-               extraLengthBits := uint(lengthExtraBits[lengthCode])
+               if false {
+                       w.writeCode(lengths[lengthCode&31])
+               } else {
+                       // inlined
+                       c := lengths[lengthCode&31]
+                       w.bits |= uint64(c.code) << (w.nbits & 63)
+                       w.nbits += c.len
+                       if w.nbits >= 48 {
+                               w.writeOutBits()
+                       }
+               }
+
+               extraLengthBits := uint16(lengthExtraBits[lengthCode&31])
                if extraLengthBits > 0 {
-                       extraLength := int32(length - lengthBase[lengthCode])
+                       extraLength := int32(length - lengthBase[lengthCode&31])
                        w.writeBits(extraLength, extraLengthBits)
                }
                // Write the offset
                offset := t.offset()
                offsetCode := offsetCode(offset)
-               w.writeCode(oeCodes[offsetCode])
-               extraOffsetBits := uint(offsetExtraBits[offsetCode])
+               if false {
+                       w.writeCode(offs[offsetCode&31])
+               } else {
+                       // inlined
+                       c := offs[offsetCode&31]
+                       w.bits |= uint64(c.code) << (w.nbits & 63)
+                       w.nbits += c.len
+                       if w.nbits >= 48 {
+                               w.writeOutBits()
+                       }
+               }
+               extraOffsetBits := uint16(offsetExtraBits[offsetCode&63])
                if extraOffsetBits > 0 {
-                       extraOffset := int32(offset - offsetBase[offsetCode])
+                       extraOffset := int32(offset - offsetBase[offsetCode&63])
                        w.writeBits(extraOffset, extraOffsetBits)
                }
        }
+       if deferEOB {
+               w.writeCode(leCodes[endBlockMarker])
+       }
 }
 
 // huffOffset is a static offset encoder used for huffman only encoding.
@@ -620,82 +783,99 @@ func init() {
        w := newHuffmanBitWriter(nil)
        w.offsetFreq[0] = 1
        huffOffset = newHuffmanEncoder(offsetCodeCount)
-       huffOffset.generate(w.offsetFreq, 15)
+       huffOffset.generate(w.offsetFreq[:offsetCodeCount], 15)
 }
 
 // writeBlockHuff encodes a block of bytes as either
 // Huffman encoded literals or uncompressed bytes if the
 // result gains very little from compression.
-func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
+func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
        if w.err != nil {
                return
        }
 
        // Clear histogram
-       for i := range w.literalFreq {
+       for i := range w.literalFreq[:] {
                w.literalFreq[i] = 0
        }
+       if !w.lastHuffMan {
+               for i := range w.offsetFreq[:] {
+                       w.offsetFreq[i] = 0
+               }
+       }
 
        // Add everything as literals
-       histogram(input, w.literalFreq)
+       estBits := histogramSize(input, w.literalFreq[:], !eof && !sync) + 15
 
-       w.literalFreq[endBlockMarker] = 1
+       // Store bytes, if we don't get a reasonable improvement.
+       ssize, storable := w.storedSize(input)
+       if storable && ssize < (estBits+estBits>>4) {
+               w.writeStoredHeader(len(input), eof)
+               w.writeBytes(input)
+               return
+       }
 
-       const numLiterals = endBlockMarker + 1
-       const numOffsets = 1
+       if w.lastHeader > 0 {
+               size, _ := w.dynamicSize(w.literalEncoding, huffOffset, w.lastHeader)
+               estBits += estBits >> (w.logReusePenalty)
 
-       w.literalEncoding.generate(w.literalFreq, 15)
+               if estBits < size {
+                       // We owe an EOB
+                       w.writeCode(w.literalEncoding.codes[endBlockMarker])
+                       w.lastHeader = 0
+               }
+       }
 
-       // Figure out smallest code.
-       // Always use dynamic Huffman or Store
-       var numCodegens int
+       const numLiterals = endBlockMarker + 1
+       const numOffsets = 1
+       if w.lastHeader == 0 {
+               w.literalFreq[endBlockMarker] = 1
+               w.literalEncoding.generate(w.literalFreq[:numLiterals], 15)
 
-       // Generate codegen and codegenFrequencies, which indicates how to encode
-       // the literalEncoding and the offsetEncoding.
-       w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
-       w.codegenEncoding.generate(w.codegenFreq[:], 7)
-       size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0)
+               // Generate codegen and codegenFrequencies, which indicates how to encode
+               // the literalEncoding and the offsetEncoding.
+               w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
+               w.codegenEncoding.generate(w.codegenFreq[:], 7)
+               numCodegens := w.codegens()
 
-       // Store bytes, if we don't get a reasonable improvement.
-       if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
-               w.writeStoredHeader(len(input), eof)
-               w.writeBytes(input)
-               return
+               // Huffman.
+               w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+               w.lastHuffMan = true
+               w.lastHeader, _ = w.headerSize()
        }
 
-       // Huffman.
-       w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
        encoding := w.literalEncoding.codes[:257]
-       n := w.nbytes
        for _, t := range input {
                // Bitwriting inlined, ~30% speedup
                c := encoding[t]
-               w.bits |= uint64(c.code) << w.nbits
-               w.nbits += uint(c.len)
-               if w.nbits < 48 {
-                       continue
-               }
-               // Store 6 bytes
-               bits := w.bits
-               w.bits >>= 48
-               w.nbits -= 48
-               bytes := w.bytes[n : n+6]
-               bytes[0] = byte(bits)
-               bytes[1] = byte(bits >> 8)
-               bytes[2] = byte(bits >> 16)
-               bytes[3] = byte(bits >> 24)
-               bytes[4] = byte(bits >> 32)
-               bytes[5] = byte(bits >> 40)
-               n += 6
-               if n < bufferFlushSize {
-                       continue
-               }
-               w.write(w.bytes[:n])
-               if w.err != nil {
-                       return // Return early in the event of write failures
+               w.bits |= uint64(c.code) << ((w.nbits) & 63)
+               w.nbits += c.len
+               if w.nbits >= 48 {
+                       bits := w.bits
+                       w.bits >>= 48
+                       w.nbits -= 48
+                       n := w.nbytes
+                       w.bytes[n] = byte(bits)
+                       w.bytes[n+1] = byte(bits >> 8)
+                       w.bytes[n+2] = byte(bits >> 16)
+                       w.bytes[n+3] = byte(bits >> 24)
+                       w.bytes[n+4] = byte(bits >> 32)
+                       w.bytes[n+5] = byte(bits >> 40)
+                       n += 6
+                       if n >= bufferFlushSize {
+                               if w.err != nil {
+                                       n = 0
+                                       return
+                               }
+                               w.write(w.bytes[:n])
+                               n = 0
+                       }
+                       w.nbytes = n
                }
-               n = 0
        }
-       w.nbytes = n
-       w.writeCode(encoding[endBlockMarker])
+       if eof || sync {
+               w.writeCode(encoding[endBlockMarker])
+               w.lastHeader = 0
+               w.lastHuffMan = false
+       }
 }
index bdcbd823b00a79efd79fa9ef2a040f7c01d57edb..d0099599c511c73150c4d3419ec0f85948197b42 100644 (file)
@@ -6,9 +6,16 @@ package flate
 
 import (
        "math"
+       "math/bits"
        "sort"
 )
 
+const (
+       maxBitsLimit = 16
+       // number of valid literals
+       literalCount = 286
+)
+
 // hcode is a huffman code with a bit code and bit length.
 type hcode struct {
        code, len uint16
@@ -24,7 +31,7 @@ type huffmanEncoder struct {
 
 type literalNode struct {
        literal uint16
-       freq    int32
+       freq    uint16
 }
 
 // A levelInfo describes the state of the constructed tree for a given depth.
@@ -53,18 +60,24 @@ func (h *hcode) set(code uint16, length uint16) {
        h.code = code
 }
 
-func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxInt32} }
+func reverseBits(number uint16, bitLength byte) uint16 {
+       return bits.Reverse16(number << ((16 - bitLength) & 15))
+}
+
+func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} }
 
 func newHuffmanEncoder(size int) *huffmanEncoder {
-       return &huffmanEncoder{codes: make([]hcode, size)}
+       // Make capacity to next power of two.
+       c := uint(bits.Len32(uint32(size - 1)))
+       return &huffmanEncoder{codes: make([]hcode, size, 1<<c)}
 }
 
 // Generates a HuffmanCode corresponding to the fixed literal table
 func generateFixedLiteralEncoding() *huffmanEncoder {
-       h := newHuffmanEncoder(maxNumLit)
+       h := newHuffmanEncoder(literalCount)
        codes := h.codes
        var ch uint16
-       for ch = 0; ch < maxNumLit; ch++ {
+       for ch = 0; ch < literalCount; ch++ {
                var bits uint16
                var size uint16
                switch {
@@ -105,7 +118,7 @@ func generateFixedOffsetEncoding() *huffmanEncoder {
 var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding()
 var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding()
 
-func (h *huffmanEncoder) bitLength(freq []int32) int {
+func (h *huffmanEncoder) bitLength(freq []uint16) int {
        var total int
        for i, f := range freq {
                if f != 0 {
@@ -115,8 +128,6 @@ func (h *huffmanEncoder) bitLength(freq []int32) int {
        return total
 }
 
-const maxBitsLimit = 16
-
 // Return the number of literals assigned to each bit size in the Huffman encoding
 //
 // This method is only called when list.length >= 3
@@ -160,9 +171,9 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
                // We initialize the levels as if we had already figured this out.
                levels[level] = levelInfo{
                        level:        level,
-                       lastFreq:     list[1].freq,
-                       nextCharFreq: list[2].freq,
-                       nextPairFreq: list[0].freq + list[1].freq,
+                       lastFreq:     int32(list[1].freq),
+                       nextCharFreq: int32(list[2].freq),
+                       nextPairFreq: int32(list[0].freq) + int32(list[1].freq),
                }
                leafCounts[level][level] = 2
                if level == 1 {
@@ -194,7 +205,12 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
                        l.lastFreq = l.nextCharFreq
                        // Lower leafCounts are the same of the previous node.
                        leafCounts[level][level] = n
-                       l.nextCharFreq = list[n].freq
+                       e := list[n]
+                       if e.literal < math.MaxUint16 {
+                               l.nextCharFreq = int32(e.freq)
+                       } else {
+                               l.nextCharFreq = math.MaxInt32
+                       }
                } else {
                        // The next item on this row is a pair from the previous row.
                        // nextPairFreq isn't valid until we generate two
@@ -270,12 +286,12 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 //
 // freq  An array of frequencies, in which frequency[i] gives the frequency of literal i.
 // maxBits  The maximum number of bits to use for any literal.
-func (h *huffmanEncoder) generate(freq []int32, maxBits int32) {
+func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
        if h.freqcache == nil {
                // Allocate a reusable buffer with the longest possible frequency table.
-               // Possible lengths are codegenCodeCount, offsetCodeCount and maxNumLit.
-               // The largest of these is maxNumLit, so we allocate for that case.
-               h.freqcache = make([]literalNode, maxNumLit+1)
+               // Possible lengths are codegenCodeCount, offsetCodeCount and literalCount.
+               // The largest of these is literalCount, so we allocate for that case.
+               h.freqcache = make([]literalNode, literalCount+1)
        }
        list := h.freqcache[:len(freq)+1]
        // Number of non-zero literals
@@ -342,3 +358,27 @@ func (s byFreq) Less(i, j int) bool {
 }
 
 func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+
+// histogramSize accumulates a histogram of b in h.
+// An estimated size in bits is returned.
+// If fill is true, unassigned values are assigned '1' in the histogram.
+// len(h) must be >= 256, and h's elements must be all zeroes.
+func histogramSize(b []byte, h []uint16, fill bool) int {
+       h = h[:256]
+       for _, t := range b {
+               h[t]++
+       }
+       invTotal := 1.0 / float64(len(b))
+       shannon := 0.0
+       single := math.Ceil(-math.Log2(invTotal))
+       for i, v := range h[:] {
+               if v > 0 {
+                       n := float64(v)
+                       shannon += math.Ceil(-math.Log2(n*invTotal) * n)
+               } else if fill {
+                       shannon += single
+                       h[i] = 1
+               }
+       }
+       return int(shannon + 0.99)
+}
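
histogramSize above turns a byte histogram into a quick cost estimate: a value occurring n times out of total contributes ceil(-log2(n/total)*n) bits, a per-symbol-rounded Shannon bound. Below is a small stand-alone sketch of the same arithmetic, without the fill behaviour; the estimateBits name and the sample inputs are illustrative.

package main

import (
	"fmt"
	"math"
)

// estimateBits returns a Shannon-entropy based estimate of the number of
// bits a Huffman coding of b would need, mirroring histogramSize's math.
func estimateBits(b []byte) int {
	var h [256]int
	for _, v := range b {
		h[v]++
	}
	invTotal := 1.0 / float64(len(b))
	shannon := 0.0
	for _, n := range h {
		if n > 0 {
			shannon += math.Ceil(-math.Log2(float64(n)*invTotal) * float64(n))
		}
	}
	return int(shannon + 0.99)
}

func main() {
	skewed := []byte("aaaaaaaaaaaaaaaabbbbbbbbcccc")
	uniform := []byte("abcdefghijklmnopqrstuvwxyz01")
	// The skewed histogram yields a much smaller estimate than the uniform one.
	fmt.Println(estimateBits(skewed), estimateBits(uniform))
}
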
index 53b63d9a0b3e5eb07364efb39eed479393131d24..6dc5b5d06e3037440c76e0e82527b5562905a2f9 100644 (file)
@@ -9,19 +9,24 @@ package flate
 
 import (
        "bufio"
+       "fmt"
        "io"
+       "math/bits"
        "strconv"
        "sync"
 )
 
 const (
-       maxCodeLen = 16 // max length of Huffman code
+       maxCodeLen     = 16 // max length of Huffman code
+       maxCodeLenMask = 15 // mask for max length of Huffman code
        // The next three numbers come from the RFC section 3.2.7, with the
        // additional proviso in section 3.2.5 which implies that distance codes
        // 30 and 31 should never occur in compressed data.
        maxNumLit  = 286
        maxNumDist = 30
        numCodes   = 19 // number of codes in Huffman meta-code
+
+       debugDecode = false
 )
 
 // Initialize the fixedHuffmanDecoder only once upon first use.
@@ -101,10 +106,10 @@ const (
 )
 
 type huffmanDecoder struct {
-       min      int                      // the minimum code length
-       chunks   [huffmanNumChunks]uint32 // chunks as described above
-       links    [][]uint32               // overflow links
-       linkMask uint32                   // mask the width of the link table
+       min      int                       // the minimum code length
+       chunks   *[huffmanNumChunks]uint16 // chunks as described above
+       links    [][]uint16                // overflow links
+       linkMask uint32                    // mask the width of the link table
 }
 
 // Initialize Huffman decoding tables from array of code lengths.
@@ -112,21 +117,24 @@ type huffmanDecoder struct {
 // tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
 // degenerate case where the tree has only a single symbol with length 1. Empty
 // trees are permitted.
-func (h *huffmanDecoder) init(bits []int) bool {
+func (h *huffmanDecoder) init(lengths []int) bool {
        // Sanity enables additional runtime tests during Huffman
        // table construction. It's intended to be used during
        // development to supplement the currently ad-hoc unit tests.
        const sanity = false
 
+       if h.chunks == nil {
+               h.chunks = &[huffmanNumChunks]uint16{}
+       }
        if h.min != 0 {
-               *h = huffmanDecoder{}
+               *h = huffmanDecoder{chunks: h.chunks, links: h.links}
        }
 
        // Count number of codes of each length,
        // compute min and max length.
        var count [maxCodeLen]int
        var min, max int
-       for _, n := range bits {
+       for _, n := range lengths {
                if n == 0 {
                        continue
                }
@@ -136,7 +144,7 @@ func (h *huffmanDecoder) init(bits []int) bool {
                if n > max {
                        max = n
                }
-               count[n]++
+               count[n&maxCodeLenMask]++
        }
 
        // Empty tree. The decompressor.huffSym function will fail later if the tree
@@ -154,8 +162,8 @@ func (h *huffmanDecoder) init(bits []int) bool {
        var nextcode [maxCodeLen]int
        for i := min; i <= max; i++ {
                code <<= 1
-               nextcode[i] = code
-               code += count[i]
+               nextcode[i&maxCodeLenMask] = code
+               code += count[i&maxCodeLenMask]
        }
 
        // Check that the coding is complete (i.e., that we've
@@ -164,37 +172,56 @@ func (h *huffmanDecoder) init(bits []int) bool {
        // accept degenerate single-code codings. See also
        // TestDegenerateHuffmanCoding.
        if code != 1<<uint(max) && !(code == 1 && max == 1) {
+               if debugDecode {
+                       fmt.Println("coding failed, code, max:", code, max, code == 1<<uint(max), code == 1 && max == 1, "(one should be true)")
+               }
                return false
        }
 
        h.min = min
+       chunks := h.chunks[:]
+       for i := range chunks {
+               chunks[i] = 0
+       }
+
        if max > huffmanChunkBits {
                numLinks := 1 << (uint(max) - huffmanChunkBits)
                h.linkMask = uint32(numLinks - 1)
 
                // create link tables
                link := nextcode[huffmanChunkBits+1] >> 1
-               h.links = make([][]uint32, huffmanNumChunks-link)
+               if cap(h.links) < huffmanNumChunks-link {
+                       h.links = make([][]uint16, huffmanNumChunks-link)
+               } else {
+                       h.links = h.links[:huffmanNumChunks-link]
+               }
                for j := uint(link); j < huffmanNumChunks; j++ {
-                       reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8
+                       reverse := int(bits.Reverse16(uint16(j)))
                        reverse >>= uint(16 - huffmanChunkBits)
                        off := j - uint(link)
                        if sanity && h.chunks[reverse] != 0 {
                                panic("impossible: overwriting existing chunk")
                        }
-                       h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1))
-                       h.links[off] = make([]uint32, numLinks)
+                       h.chunks[reverse] = uint16(off<<huffmanValueShift | (huffmanChunkBits + 1))
+                       if cap(h.links[off]) < numLinks {
+                               h.links[off] = make([]uint16, numLinks)
+                       } else {
+                               links := h.links[off][:0]
+                               h.links[off] = links[:numLinks]
+                       }
                }
+       } else {
+               h.links = h.links[:0]
        }
 
-       for i, n := range bits {
+       for i, n := range lengths {
                if n == 0 {
                        continue
                }
                code := nextcode[n]
                nextcode[n]++
-               chunk := uint32(i<<huffmanValueShift | n)
-               reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8
+               chunk := uint16(i<<huffmanValueShift | n)
+               reverse := int(bits.Reverse16(uint16(code)))
                reverse >>= uint(16 - n)
                if n <= huffmanChunkBits {
                        for off := reverse; off < len(h.chunks); off += 1 << uint(n) {
@@ -326,6 +353,9 @@ func (f *decompressor) nextBlock() {
                f.huffmanBlock()
        default:
                // 3 is reserved.
+               if debugDecode {
+                       fmt.Println("reserved data block encountered")
+               }
                f.err = CorruptInputError(f.roffset)
        }
 }
@@ -404,11 +434,17 @@ func (f *decompressor) readHuffman() error {
        }
        nlit := int(f.b&0x1F) + 257
        if nlit > maxNumLit {
+               if debugDecode {
+                       fmt.Println("nlit > maxNumLit", nlit)
+               }
                return CorruptInputError(f.roffset)
        }
        f.b >>= 5
        ndist := int(f.b&0x1F) + 1
        if ndist > maxNumDist {
+               if debugDecode {
+                       fmt.Println("ndist > maxNumDist", ndist)
+               }
                return CorruptInputError(f.roffset)
        }
        f.b >>= 5
@@ -432,6 +468,9 @@ func (f *decompressor) readHuffman() error {
                f.codebits[codeOrder[i]] = 0
        }
        if !f.h1.init(f.codebits[0:]) {
+               if debugDecode {
+                       fmt.Println("init codebits failed")
+               }
                return CorruptInputError(f.roffset)
        }
 
@@ -459,6 +498,9 @@ func (f *decompressor) readHuffman() error {
                        rep = 3
                        nb = 2
                        if i == 0 {
+                               if debugDecode {
+                                       fmt.Println("i==0")
+                               }
                                return CorruptInputError(f.roffset)
                        }
                        b = f.bits[i-1]
@@ -473,6 +515,9 @@ func (f *decompressor) readHuffman() error {
                }
                for f.nb < nb {
                        if err := f.moreBits(); err != nil {
+                               if debugDecode {
+                                       fmt.Println("morebits:", err)
+                               }
                                return err
                        }
                }
@@ -480,6 +525,9 @@ func (f *decompressor) readHuffman() error {
                f.b >>= nb
                f.nb -= nb
                if i+rep > n {
+                       if debugDecode {
+                               fmt.Println("i+rep > n", i, rep, n)
+                       }
                        return CorruptInputError(f.roffset)
                }
                for j := 0; j < rep; j++ {
@@ -489,6 +537,9 @@ func (f *decompressor) readHuffman() error {
        }
 
        if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) {
+               if debugDecode {
+                       fmt.Println("init2 failed")
+               }
                return CorruptInputError(f.roffset)
        }
 
@@ -566,12 +617,18 @@ readLiteral:
                        length = 258
                        n = 0
                default:
+                       if debugDecode {
+                               fmt.Println(v, ">= maxNumLit")
+                       }
                        f.err = CorruptInputError(f.roffset)
                        return
                }
                if n > 0 {
                        for f.nb < n {
                                if err = f.moreBits(); err != nil {
+                                       if debugDecode {
+                                               fmt.Println("morebits n>0:", err)
+                                       }
                                        f.err = err
                                        return
                                }
@@ -585,15 +642,21 @@ readLiteral:
                if f.hd == nil {
                        for f.nb < 5 {
                                if err = f.moreBits(); err != nil {
+                                       if debugDecode {
+                                               fmt.Println("morebits f.nb<5:", err)
+                                       }
                                        f.err = err
                                        return
                                }
                        }
-                       dist = int(reverseByte[(f.b&0x1F)<<3])
+                       dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
                        f.b >>= 5
                        f.nb -= 5
                } else {
                        if dist, err = f.huffSym(f.hd); err != nil {
+                               if debugDecode {
+                                       fmt.Println("huffsym:", err)
+                               }
                                f.err = err
                                return
                        }
@@ -608,6 +671,9 @@ readLiteral:
                        extra := (dist & 1) << nb
                        for f.nb < nb {
                                if err = f.moreBits(); err != nil {
+                                       if debugDecode {
+                                               fmt.Println("morebits f.nb<nb:", err)
+                                       }
                                        f.err = err
                                        return
                                }
@@ -617,12 +683,18 @@ readLiteral:
                        f.nb -= nb
                        dist = 1<<(nb+1) + 1 + extra
                default:
+                       if debugDecode {
+                               fmt.Println("dist too big:", dist, maxNumDist)
+                       }
                        f.err = CorruptInputError(f.roffset)
                        return
                }
 
                // No check on length; encoding can be prescient.
                if dist > f.dict.histSize() {
+                       if debugDecode {
+                               fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+                       }
                        f.err = CorruptInputError(f.roffset)
                        return
                }
@@ -661,15 +733,15 @@ func (f *decompressor) dataBlock() {
        nr, err := io.ReadFull(f.r, f.buf[0:4])
        f.roffset += int64(nr)
        if err != nil {
-               if err == io.EOF {
-                       err = io.ErrUnexpectedEOF
-               }
-               f.err = err
+               f.err = noEOF(err)
                return
        }
        n := int(f.buf[0]) | int(f.buf[1])<<8
        nn := int(f.buf[2]) | int(f.buf[3])<<8
        if uint16(nn) != uint16(^n) {
+               if debugDecode {
+                       fmt.Println("uint16(nn) != uint16(^n)", nn, ^n)
+               }
                f.err = CorruptInputError(f.roffset)
                return
        }
@@ -697,10 +769,7 @@ func (f *decompressor) copyData() {
        f.copyLen -= cnt
        f.dict.writeMark(cnt)
        if err != nil {
-               if err == io.EOF {
-                       err = io.ErrUnexpectedEOF
-               }
-               f.err = err
+               f.err = noEOF(err)
                return
        }
 
@@ -722,13 +791,18 @@ func (f *decompressor) finishBlock() {
        f.step = (*decompressor).nextBlock
 }
 
+// noEOF returns err, unless err == io.EOF, in which case it returns io.ErrUnexpectedEOF.
+func noEOF(e error) error {
+       if e == io.EOF {
+               return io.ErrUnexpectedEOF
+       }
+       return e
+}
+
 func (f *decompressor) moreBits() error {
        c, err := f.r.ReadByte()
        if err != nil {
-               if err == io.EOF {
-                       err = io.ErrUnexpectedEOF
-               }
-               return err
+               return noEOF(err)
        }
        f.roffset++
        f.b |= uint32(c) << f.nb
@@ -743,25 +817,40 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
        // cases, the chunks slice will be 0 for the invalid sequence, leading it
        // satisfy the n == 0 check below.
        n := uint(h.min)
+       // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+       // but is smart enough to keep local variables in registers, so use nb and b,
+       // inline call to moreBits and reassign b,nb back to f on return.
+       nb, b := f.nb, f.b
        for {
-               for f.nb < n {
-                       if err := f.moreBits(); err != nil {
-                               return 0, err
+               for nb < n {
+                       c, err := f.r.ReadByte()
+                       if err != nil {
+                               f.b = b
+                               f.nb = nb
+                               return 0, noEOF(err)
                        }
+                       f.roffset++
+                       b |= uint32(c) << (nb & 31)
+                       nb += 8
                }
-               chunk := h.chunks[f.b&(huffmanNumChunks-1)]
+               chunk := h.chunks[b&(huffmanNumChunks-1)]
                n = uint(chunk & huffmanCountMask)
                if n > huffmanChunkBits {
-                       chunk = h.links[chunk>>huffmanValueShift][(f.b>>huffmanChunkBits)&h.linkMask]
+                       chunk = h.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&h.linkMask]
                        n = uint(chunk & huffmanCountMask)
                }
-               if n <= f.nb {
+               if n <= nb {
                        if n == 0 {
+                               f.b = b
+                               f.nb = nb
+                               if debugDecode {
+                                       fmt.Println("huffsym: n==0")
+                               }
                                f.err = CorruptInputError(f.roffset)
                                return 0, f.err
                        }
-                       f.b >>= n
-                       f.nb -= n
+                       f.b = b >> (n & 31)
+                       f.nb = nb - n
                        return int(chunk >> huffmanValueShift), nil
                }
        }
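
The rewritten huffSym above copies f.b and f.nb into locals, refills them inline, and writes them back only on exit, so the hot loop keeps its state in registers; the (nb & 31) and (n & 31) masks also let the compiler drop shift-amount checks. A stripped-down sketch of that pattern (the bitReader type and names are illustrative, not the library's):

package main

import "fmt"

// bitReader holds a byte source plus a small bit buffer, like the decompressor above.
type bitReader struct {
	src []byte
	off int
	b   uint32 // bit buffer
	nb  uint   // valid bits in b
}

// readBits returns the low n bits of the stream, refilling byte by byte.
func (r *bitReader) readBits(n uint) (uint32, error) {
	nb, b := r.nb, r.b // locals stay in registers inside the loop
	for nb < n {
		if r.off >= len(r.src) {
			r.nb, r.b = nb, b
			return 0, fmt.Errorf("unexpected EOF")
		}
		b |= uint32(r.src[r.off]) << (nb & 31) // masked shift: no shift-range check
		r.off++
		nb += 8
	}
	v := b & (1<<n - 1)
	r.b, r.nb = b>>(n&31), nb-n // write the state back once on the way out
	return v, nil
}

func main() {
	r := &bitReader{src: []byte{0b10110100, 0xff}}
	fmt.Println(r.readBits(4)) // low 4 bits: 0b0100
	fmt.Println(r.readBits(6)) // next 6 bits
}
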
@@ -799,6 +888,8 @@ func (f *decompressor) Reset(r io.Reader, dict []byte) error {
                r:        makeReader(r),
                bits:     f.bits,
                codebits: f.codebits,
+               h1:       f.h1,
+               h2:       f.h2,
                dict:     f.dict,
                step:     (*decompressor).nextBlock,
        }
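
The Reset change above carries the h1 and h2 Huffman decoders over to the reused decompressor, so a reused reader can keep them instead of re-allocating. Callers reach this through the flate.Resetter interface, the same as with the standard library. A small usage sketch, assuming the klauspost/compress drop-in import path used by this vendor tree:

package main

import (
	"bytes"
	"io"
	"os"

	"github.com/klauspost/compress/flate"
)

// Reuse one decompressor across many streams: Reset keeps the allocated state,
// so the second and later streams avoid most of the setup cost.
func main() {
	// Build two small deflate streams.
	var s1, s2 bytes.Buffer
	for i, buf := range []*bytes.Buffer{&s1, &s2} {
		w, _ := flate.NewWriter(buf, flate.BestSpeed)
		io.WriteString(w, "stream ")
		io.WriteString(w, string(rune('1'+i)))
		io.WriteString(w, "\n")
		w.Close()
	}

	r := flate.NewReader(&s1)
	io.Copy(os.Stdout, r)

	// Reset instead of allocating a second reader.
	r.(flate.Resetter).Reset(&s2, nil)
	io.Copy(os.Stdout, r)
	r.Close()
}
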
diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go
new file mode 100644 (file)
index 0000000..20de8f1
--- /dev/null
@@ -0,0 +1,174 @@
+package flate
+
+// fastEncL1 is the level 1 encoder. It embeds fastGen, which maintains the
+// match table and the previous byte block.
+// This is the generic implementation.
+type fastEncL1 struct {
+       fastGen
+       table [tableSize]tableEntry
+}
+
+// Encode implements the level 1 (fastest) matching algorithm.
+func (e *fastEncL1) Encode(dst *tokens, src []byte) {
+       const (
+               inputMargin            = 12 - 1
+               minNonLiteralBlockSize = 1 + 1 + inputMargin
+       )
+
+       // Protect against e.cur wraparound.
+       for e.cur >= bufferReset {
+               if len(e.hist) == 0 {
+                       for i := range e.table[:] {
+                               e.table[i] = tableEntry{}
+                       }
+                       e.cur = maxMatchOffset
+                       break
+               }
+               // Shift down everything in the table that isn't already too far away.
+               minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+               for i := range e.table[:] {
+                       v := e.table[i].offset
+                       if v <= minOff {
+                               v = 0
+                       } else {
+                               v = v - e.cur + maxMatchOffset
+                       }
+                       e.table[i].offset = v
+               }
+               e.cur = maxMatchOffset
+       }
+
+       s := e.addBlock(src)
+
+       // This check isn't in the Snappy implementation, but there, the caller
+       // instead of the callee handles this case.
+       if len(src) < minNonLiteralBlockSize {
+               // We do not fill the token table.
+               // This will be picked up by caller.
+               dst.n = uint16(len(src))
+               return
+       }
+
+       // Override src
+       src = e.hist
+       nextEmit := s
+
+       // sLimit is when to stop looking for offset/length copies. The inputMargin
+       // lets us use a fast path for emitLiteral in the main loop, while we are
+       // looking for copies.
+       sLimit := int32(len(src) - inputMargin)
+
+       // nextEmit is where in src the next emitLiteral should start from.
+       cv := load3232(src, s)
+
+       for {
+               const skipLog = 5
+               const doEvery = 2
+
+               nextS := s
+               var candidate tableEntry
+               for {
+                       nextHash := hash(cv)
+                       candidate = e.table[nextHash]
+                       nextS = s + doEvery + (s-nextEmit)>>skipLog
+                       if nextS > sLimit {
+                               goto emitRemainder
+                       }
+
+                       now := load6432(src, nextS)
+                       e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+                       nextHash = hash(uint32(now))
+
+                       offset := s - (candidate.offset - e.cur)
+                       if offset < maxMatchOffset && cv == candidate.val {
+                               e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
+                               break
+                       }
+
+                       // Do one right away...
+                       cv = uint32(now)
+                       s = nextS
+                       nextS++
+                       candidate = e.table[nextHash]
+                       now >>= 8
+                       e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+
+                       offset = s - (candidate.offset - e.cur)
+                       if offset < maxMatchOffset && cv == candidate.val {
+                               e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
+                               break
+                       }
+                       cv = uint32(now)
+                       s = nextS
+               }
+
+               // A 4-byte match has been found. We'll later see if more than 4 bytes
+               // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+               // them as literal bytes.
+               for {
+                       // Invariant: we have a 4-byte match at s, and no need to emit any
+                       // literal bytes prior to s.
+
+                       // Extend the 4-byte match as long as possible.
+                       t := candidate.offset - e.cur
+                       l := e.matchlenLong(s+4, t+4, src) + 4
+
+                       // Extend backwards
+                       for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+                               s--
+                               t--
+                               l++
+                       }
+                       if nextEmit < s {
+                               emitLiteral(dst, src[nextEmit:s])
+                       }
+
+                       // Save the match found
+                       dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+                       s += l
+                       nextEmit = s
+                       if nextS >= s {
+                               s = nextS + 1
+                       }
+                       if s >= sLimit {
+                               // Index first pair after match end.
+                               if int(s+l+4) < len(src) {
+                                       cv := load3232(src, s)
+                                       e.table[hash(cv)] = tableEntry{offset: s + e.cur, val: cv}
+                               }
+                               goto emitRemainder
+                       }
+
+                       // We could immediately start working at s now, but to improve
+                       // compression we first update the hash table at s-2 and at s. If
+                       // another emitCopy is not our next move, also calculate nextHash
+                       // at s+1. At least on GOARCH=amd64, these three hash calculations
+                       // are faster as one load64 call (with some shifts) instead of
+                       // three load32 calls.
+                       x := load6432(src, s-2)
+                       o := e.cur + s - 2
+                       prevHash := hash(uint32(x))
+                       e.table[prevHash] = tableEntry{offset: o, val: uint32(x)}
+                       x >>= 16
+                       currHash := hash(uint32(x))
+                       candidate = e.table[currHash]
+                       e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x)}
+
+                       offset := s - (candidate.offset - e.cur)
+                       if offset > maxMatchOffset || uint32(x) != candidate.val {
+                               cv = uint32(x >> 8)
+                               s++
+                               break
+                       }
+               }
+       }
+
+emitRemainder:
+       if int(nextEmit) < len(src) {
+               // If nothing was added, don't encode literals.
+               if dst.n == 0 {
+                       return
+               }
+               emitLiteral(dst, src[nextEmit:])
+       }
+}
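
The core of the level 1 encoder above: hash the 4 bytes at the current position into a table with one (offset, val) entry per bucket, and treat the stored position as a match candidate if it lies within maxMatchOffset and its stored 4 bytes equal the current ones. A bare-bones sketch of that lookup, with illustrative table size and hash multiplier (not the library's):

package main

import (
	"encoding/binary"
	"fmt"
)

const (
	tableBits = 14
	tableSize = 1 << tableBits
	maxDist   = 32768 // deflate window
)

// hash4 maps 4 bytes to a table bucket; the multiplier here is illustrative.
func hash4(u uint32) uint32 {
	return (u * 2654435761) >> (32 - tableBits)
}

// findMatches prints every position whose 4 bytes were seen earlier within the window.
func findMatches(src []byte) {
	var table [tableSize]int32 // position+1 of the last occurrence, 0 = empty
	for s := 0; s+4 <= len(src); s++ {
		cv := binary.LittleEndian.Uint32(src[s:])
		h := hash4(cv)
		if c := table[h]; c != 0 {
			cand := int(c - 1)
			if s-cand <= maxDist && binary.LittleEndian.Uint32(src[cand:]) == cv {
				fmt.Printf("match at %d -> %d (dist %d)\n", s, cand, s-cand)
			}
		}
		table[h] = int32(s + 1)
	}
}

func main() {
	findMatches([]byte("abcdefgh--abcdefgh"))
}
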
diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go
new file mode 100644 (file)
index 0000000..7c82443
--- /dev/null
@@ -0,0 +1,199 @@
+package flate
+
+// fastEncL2 is the level 2 encoder. It embeds fastGen, which maintains the
+// match table and the previous byte block.
+// This is the generic implementation.
+type fastEncL2 struct {
+       fastGen
+       table [bTableSize]tableEntry
+}
+
+// Encode uses an algorithm similar to level 1, but is capable of matching
+// across blocks, giving better compression at a small slowdown.
+func (e *fastEncL2) Encode(dst *tokens, src []byte) {
+       const (
+               inputMargin            = 12 - 1
+               minNonLiteralBlockSize = 1 + 1 + inputMargin
+       )
+
+       // Protect against e.cur wraparound.
+       for e.cur >= bufferReset {
+               if len(e.hist) == 0 {
+                       for i := range e.table[:] {
+                               e.table[i] = tableEntry{}
+                       }
+                       e.cur = maxMatchOffset
+                       break
+               }
+               // Shift down everything in the table that isn't already too far away.
+               minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+               for i := range e.table[:] {
+                       v := e.table[i].offset
+                       if v <= minOff {
+                               v = 0
+                       } else {
+                               v = v - e.cur + maxMatchOffset
+                       }
+                       e.table[i].offset = v
+               }
+               e.cur = maxMatchOffset
+       }
+
+       s := e.addBlock(src)
+
+       // This check isn't in the Snappy implementation, but there, the caller
+       // instead of the callee handles this case.
+       if len(src) < minNonLiteralBlockSize {
+               // We do not fill the token table.
+               // This will be picked up by caller.
+               dst.n = uint16(len(src))
+               return
+       }
+
+       // Override src
+       src = e.hist
+       nextEmit := s
+
+       // sLimit is when to stop looking for offset/length copies. The inputMargin
+       // lets us use a fast path for emitLiteral in the main loop, while we are
+       // looking for copies.
+       sLimit := int32(len(src) - inputMargin)
+
+       // nextEmit is where in src the next emitLiteral should start from.
+       cv := load3232(src, s)
+       for {
+               // When should we start skipping if we haven't found matches in a long while.
+               const skipLog = 5
+               const doEvery = 2
+
+               nextS := s
+               var candidate tableEntry
+               for {
+                       nextHash := hash4u(cv, bTableBits)
+                       s = nextS
+                       nextS = s + doEvery + (s-nextEmit)>>skipLog
+                       if nextS > sLimit {
+                               goto emitRemainder
+                       }
+                       candidate = e.table[nextHash]
+                       now := load6432(src, nextS)
+                       e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+                       nextHash = hash4u(uint32(now), bTableBits)
+
+                       offset := s - (candidate.offset - e.cur)
+                       if offset < maxMatchOffset && cv == candidate.val {
+                               e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
+                               break
+                       }
+
+                       // Do one right away...
+                       cv = uint32(now)
+                       s = nextS
+                       nextS++
+                       candidate = e.table[nextHash]
+                       now >>= 8
+                       e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+
+                       offset = s - (candidate.offset - e.cur)
+                       if offset < maxMatchOffset && cv == candidate.val {
+                               break
+                       }
+                       cv = uint32(now)
+               }
+
+               // A 4-byte match has been found. We'll later see if more than 4 bytes
+               // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+               // them as literal bytes.
+
+               // Call emitCopy, and then see if another emitCopy could be our next
+               // move. Repeat until we find no match for the input immediately after
+               // what was consumed by the last emitCopy call.
+               //
+               // If we exit this loop normally then we need to call emitLiteral next,
+               // though we don't yet know how big the literal will be. We handle that
+               // by proceeding to the next iteration of the main loop. We also can
+               // exit this loop via goto if we get close to exhausting the input.
+               for {
+                       // Invariant: we have a 4-byte match at s, and no need to emit any
+                       // literal bytes prior to s.
+
+                       // Extend the 4-byte match as long as possible.
+                       t := candidate.offset - e.cur
+                       l := e.matchlenLong(s+4, t+4, src) + 4
+
+                       // Extend backwards
+                       for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+                               s--
+                               t--
+                               l++
+                       }
+                       if nextEmit < s {
+                               emitLiteral(dst, src[nextEmit:s])
+                       }
+
+                       dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+                       s += l
+                       nextEmit = s
+                       if nextS >= s {
+                               s = nextS + 1
+                       }
+
+                       if s >= sLimit {
+                               // Index first pair after match end.
+                               if int(s+l+4) < len(src) {
+                                       cv := load3232(src, s)
+                                       e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur, val: cv}
+                               }
+                               goto emitRemainder
+                       }
+
+                       // Store every second hash in-between, but offset by 1.
+                       for i := s - l + 2; i < s-5; i += 7 {
+                               x := load6432(src, int32(i))
+                               nextHash := hash4u(uint32(x), bTableBits)
+                               e.table[nextHash] = tableEntry{offset: e.cur + i, val: uint32(x)}
+                               // Skip one
+                               x >>= 16
+                               nextHash = hash4u(uint32(x), bTableBits)
+                               e.table[nextHash] = tableEntry{offset: e.cur + i + 2, val: uint32(x)}
+                               // Skip one
+                               x >>= 16
+                               nextHash = hash4u(uint32(x), bTableBits)
+                               e.table[nextHash] = tableEntry{offset: e.cur + i + 4, val: uint32(x)}
+                       }
+
+                       // We could immediately start working at s now, but to improve
+                       // compression we first update the hash table at s-2 to s. If
+                       // another emitCopy is not our next move, also calculate nextHash
+                       // at s+1. At least on GOARCH=amd64, these three hash calculations
+                       // are faster as one load64 call (with some shifts) instead of
+                       // three load32 calls.
+                       x := load6432(src, s-2)
+                       o := e.cur + s - 2
+                       prevHash := hash4u(uint32(x), bTableBits)
+                       prevHash2 := hash4u(uint32(x>>8), bTableBits)
+                       e.table[prevHash] = tableEntry{offset: o, val: uint32(x)}
+                       e.table[prevHash2] = tableEntry{offset: o + 1, val: uint32(x >> 8)}
+                       currHash := hash4u(uint32(x>>16), bTableBits)
+                       candidate = e.table[currHash]
+                       e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x >> 16)}
+
+                       offset := s - (candidate.offset - e.cur)
+                       if offset > maxMatchOffset || uint32(x>>16) != candidate.val {
+                               cv = uint32(x >> 24)
+                               s++
+                               break
+                       }
+               }
+       }
+
+emitRemainder:
+       if int(nextEmit) < len(src) {
+               // If nothing was added, don't encode literals.
+               if dst.n == 0 {
+                       return
+               }
+
+               emitLiteral(dst, src[nextEmit:])
+       }
+}
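
Every level above starts with the same wraparound guard: when the running position e.cur nears bufferReset, entries already outside the 32 KiB window are cleared and the rest are rebased so the newest data ends at maxMatchOffset. A compact sketch of that rebasing rule, with made-up input values:

package main

import "fmt"

const maxMatchOffset = 32768

// rebase clears offsets that can no longer match and shifts the rest so they keep
// the same distance from the end of the window, as the encoders above do.
func rebase(offsets []int32, cur, histLen int32) {
	minOff := cur + histLen - maxMatchOffset
	for i, v := range offsets {
		if v <= minOff {
			offsets[i] = 0 // too old to ever match again
		} else {
			offsets[i] = v - cur + maxMatchOffset
		}
	}
}

func main() {
	offs := []int32{100, 2_000_000_000, 2_100_000_000}
	rebase(offs, 2_100_000_000, 10_000)
	fmt.Println(offs) // [0 0 32768] with these inputs
}
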
diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go
new file mode 100644 (file)
index 0000000..4153d24
--- /dev/null
@@ -0,0 +1,225 @@
+package flate
+
+// fastEncL3 is the level 3 encoder.
+type fastEncL3 struct {
+       fastGen
+       table [tableSize]tableEntryPrev
+}
+
+// Encode uses an algorithm similar to level 2, but checks up to two candidates per position.
+func (e *fastEncL3) Encode(dst *tokens, src []byte) {
+       const (
+               inputMargin            = 8 - 1
+               minNonLiteralBlockSize = 1 + 1 + inputMargin
+       )
+
+       // Protect against e.cur wraparound.
+       for e.cur >= bufferReset {
+               if len(e.hist) == 0 {
+                       for i := range e.table[:] {
+                               e.table[i] = tableEntryPrev{}
+                       }
+                       e.cur = maxMatchOffset
+                       break
+               }
+               // Shift down everything in the table that isn't already too far away.
+               minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+               for i := range e.table[:] {
+                       v := e.table[i]
+                       if v.Cur.offset <= minOff {
+                               v.Cur.offset = 0
+                       } else {
+                               v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+                       }
+                       if v.Prev.offset <= minOff {
+                               v.Prev.offset = 0
+                       } else {
+                               v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+                       }
+                       e.table[i] = v
+               }
+               e.cur = maxMatchOffset
+       }
+
+       s := e.addBlock(src)
+
+       // Skip if too small.
+       if len(src) < minNonLiteralBlockSize {
+               // We do not fill the token table.
+               // This will be picked up by caller.
+               dst.n = uint16(len(src))
+               return
+       }
+
+       // Override src
+       src = e.hist
+       nextEmit := s
+
+       // sLimit is when to stop looking for offset/length copies. The inputMargin
+       // lets us use a fast path for emitLiteral in the main loop, while we are
+       // looking for copies.
+       sLimit := int32(len(src) - inputMargin)
+
+       // nextEmit is where in src the next emitLiteral should start from.
+       cv := load3232(src, s)
+       for {
+               const skipLog = 6
+               nextS := s
+               var candidate tableEntry
+               for {
+                       nextHash := hash(cv)
+                       s = nextS
+                       nextS = s + 1 + (s-nextEmit)>>skipLog
+                       if nextS > sLimit {
+                               goto emitRemainder
+                       }
+                       candidates := e.table[nextHash]
+                       now := load3232(src, nextS)
+                       e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
+
+                       // Check both candidates
+                       candidate = candidates.Cur
+                       offset := s - (candidate.offset - e.cur)
+                       if cv == candidate.val {
+                               if offset > maxMatchOffset {
+                                       cv = now
+                                       // Previous will also be invalid, we have nothing.
+                                       continue
+                               }
+                               o2 := s - (candidates.Prev.offset - e.cur)
+                               if cv != candidates.Prev.val || o2 > maxMatchOffset {
+                                       break
+                               }
+                               // Both match and are valid, pick longest.
+                               l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:])
+                               if l2 > l1 {
+                                       candidate = candidates.Prev
+                               }
+                               break
+                       } else {
+                               // We only check if value mismatches.
+                               // Offset will always be invalid in other cases.
+                               candidate = candidates.Prev
+                               if cv == candidate.val {
+                                       offset := s - (candidate.offset - e.cur)
+                                       if offset <= maxMatchOffset {
+                                               break
+                                       }
+                               }
+                       }
+                       cv = now
+               }
+
+               // Call emitCopy, and then see if another emitCopy could be our next
+               // move. Repeat until we find no match for the input immediately after
+               // what was consumed by the last emitCopy call.
+               //
+               // If we exit this loop normally then we need to call emitLiteral next,
+               // though we don't yet know how big the literal will be. We handle that
+               // by proceeding to the next iteration of the main loop. We also can
+               // exit this loop via goto if we get close to exhausting the input.
+               for {
+                       // Invariant: we have a 4-byte match at s, and no need to emit any
+                       // literal bytes prior to s.
+
+                       // Extend the 4-byte match as long as possible.
+                       //
+                       t := candidate.offset - e.cur
+                       l := e.matchlenLong(s+4, t+4, src) + 4
+
+                       // Extend backwards
+                       for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+                               s--
+                               t--
+                               l++
+                       }
+                       if nextEmit < s {
+                               emitLiteral(dst, src[nextEmit:s])
+                       }
+
+                       dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+                       s += l
+                       nextEmit = s
+                       if nextS >= s {
+                               s = nextS + 1
+                       }
+
+                       if s >= sLimit {
+                               t += l
+                               // Index first pair after match end.
+                               if int(t+4) < len(src) && t > 0 {
+                                       cv := load3232(src, t)
+                                       nextHash := hash(cv)
+                                       e.table[nextHash] = tableEntryPrev{
+                                               Prev: e.table[nextHash].Cur,
+                                               Cur:  tableEntry{offset: e.cur + t, val: cv},
+                                       }
+                               }
+                               goto emitRemainder
+                       }
+
+                       // We could immediately start working at s now, but to improve
+                       // compression we first update the hash table at s-3 to s.
+                       x := load6432(src, s-3)
+                       prevHash := hash(uint32(x))
+                       e.table[prevHash] = tableEntryPrev{
+                               Prev: e.table[prevHash].Cur,
+                               Cur:  tableEntry{offset: e.cur + s - 3, val: uint32(x)},
+                       }
+                       x >>= 8
+                       prevHash = hash(uint32(x))
+
+                       e.table[prevHash] = tableEntryPrev{
+                               Prev: e.table[prevHash].Cur,
+                               Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
+                       }
+                       x >>= 8
+                       prevHash = hash(uint32(x))
+
+                       e.table[prevHash] = tableEntryPrev{
+                               Prev: e.table[prevHash].Cur,
+                               Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
+                       }
+                       x >>= 8
+                       currHash := hash(uint32(x))
+                       candidates := e.table[currHash]
+                       cv = uint32(x)
+                       e.table[currHash] = tableEntryPrev{
+                               Prev: candidates.Cur,
+                               Cur:  tableEntry{offset: s + e.cur, val: cv},
+                       }
+
+                       // Check both candidates
+                       candidate = candidates.Cur
+                       if cv == candidate.val {
+                               offset := s - (candidate.offset - e.cur)
+                               if offset <= maxMatchOffset {
+                                       continue
+                               }
+                       } else {
+                               // We only check if value mismatches.
+                               // Offset will always be invalid in other cases.
+                               candidate = candidates.Prev
+                               if cv == candidate.val {
+                                       offset := s - (candidate.offset - e.cur)
+                                       if offset <= maxMatchOffset {
+                                               continue
+                                       }
+                               }
+                       }
+                       cv = uint32(x >> 8)
+                       s++
+                       break
+               }
+       }
+
+emitRemainder:
+       if int(nextEmit) < len(src) {
+               // If nothing was added, don't encode literals.
+               if dst.n == 0 {
+                       return
+               }
+
+               emitLiteral(dst, src[nextEmit:])
+       }
+}
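
Level 3 keeps two candidates per hash bucket (Cur and Prev in tableEntryPrev) and, when both match the current 4 bytes, keeps the one that extends furthest. A minimal sketch of that tie-break; matchLen here is a plain stand-in for the library's helper:

package main

import "fmt"

// matchLen counts the number of leading bytes a and b have in common.
func matchLen(a, b []byte) int {
	n := 0
	for n < len(a) && n < len(b) && a[n] == b[n] {
		n++
	}
	return n
}

// pickCandidate returns whichever of the two earlier positions matches src[s:] longer.
func pickCandidate(src []byte, s, cur, prev int) int {
	if matchLen(src[s:], src[prev:]) > matchLen(src[s:], src[cur:]) {
		return prev
	}
	return cur
}

func main() {
	src := []byte("abcxx...abcyy...abcyyz")
	// Positions 0 and 8 both start with "abc"; position 8 matches two bytes further.
	fmt.Println(pickCandidate(src, 16, 0, 8)) // 8
}
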
diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go
new file mode 100644 (file)
index 0000000..c689ac7
--- /dev/null
@@ -0,0 +1,210 @@
+package flate
+
+import "fmt"
+
+type fastEncL4 struct {
+       fastGen
+       table  [tableSize]tableEntry
+       bTable [tableSize]tableEntry
+}
+
+func (e *fastEncL4) Encode(dst *tokens, src []byte) {
+       const (
+               inputMargin            = 12 - 1
+               minNonLiteralBlockSize = 1 + 1 + inputMargin
+       )
+
+       // Protect against e.cur wraparound.
+       for e.cur >= bufferReset {
+               if len(e.hist) == 0 {
+                       for i := range e.table[:] {
+                               e.table[i] = tableEntry{}
+                       }
+                       for i := range e.bTable[:] {
+                               e.bTable[i] = tableEntry{}
+                       }
+                       e.cur = maxMatchOffset
+                       break
+               }
+               // Shift down everything in the table that isn't already too far away.
+               minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+               for i := range e.table[:] {
+                       v := e.table[i].offset
+                       if v <= minOff {
+                               v = 0
+                       } else {
+                               v = v - e.cur + maxMatchOffset
+                       }
+                       e.table[i].offset = v
+               }
+               for i := range e.bTable[:] {
+                       v := e.bTable[i].offset
+                       if v <= minOff {
+                               v = 0
+                       } else {
+                               v = v - e.cur + maxMatchOffset
+                       }
+                       e.bTable[i].offset = v
+               }
+               e.cur = maxMatchOffset
+       }
+
+       s := e.addBlock(src)
+
+       // This check isn't in the Snappy implementation, but there, the caller
+       // instead of the callee handles this case.
+       if len(src) < minNonLiteralBlockSize {
+               // We do not fill the token table.
+               // This will be picked up by caller.
+               dst.n = uint16(len(src))
+               return
+       }
+
+       // Override src
+       src = e.hist
+       nextEmit := s
+
+       // sLimit is when to stop looking for offset/length copies. The inputMargin
+       // lets us use a fast path for emitLiteral in the main loop, while we are
+       // looking for copies.
+       sLimit := int32(len(src) - inputMargin)
+
+       // nextEmit is where in src the next emitLiteral should start from.
+       cv := load6432(src, s)
+       for {
+               const skipLog = 6
+               const doEvery = 1
+
+               nextS := s
+               var t int32
+               for {
+                       nextHashS := hash4x64(cv, tableBits)
+                       nextHashL := hash7(cv, tableBits)
+
+                       s = nextS
+                       nextS = s + doEvery + (s-nextEmit)>>skipLog
+                       if nextS > sLimit {
+                               goto emitRemainder
+                       }
+                       // Fetch a short+long candidate
+                       sCandidate := e.table[nextHashS]
+                       lCandidate := e.bTable[nextHashL]
+                       next := load6432(src, nextS)
+                       entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+                       e.table[nextHashS] = entry
+                       e.bTable[nextHashL] = entry
+
+                       t = lCandidate.offset - e.cur
+                       if s-t < maxMatchOffset && uint32(cv) == lCandidate.val {
+                               // We got a long match. Use that.
+                               break
+                       }
+
+                       t = sCandidate.offset - e.cur
+                       if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
+                       // Found a 4-byte match...
+                               lCandidate = e.bTable[hash7(next, tableBits)]
+
+                               // If the next long is a candidate, check if we should use that instead...
+                               lOff := nextS - (lCandidate.offset - e.cur)
+                               if lOff < maxMatchOffset && lCandidate.val == uint32(next) {
+                                       l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:])
+                                       if l2 > l1 {
+                                               s = nextS
+                                               t = lCandidate.offset - e.cur
+                                       }
+                               }
+                               break
+                       }
+                       cv = next
+               }
+
+               // A 4-byte match has been found. We'll later see if more than 4 bytes
+               // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+               // them as literal bytes.
+
+               // Extend the 4-byte match as long as possible.
+               l := e.matchlenLong(s+4, t+4, src) + 4
+
+               // Extend backwards
+               for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+                       s--
+                       t--
+                       l++
+               }
+               if nextEmit < s {
+                       emitLiteral(dst, src[nextEmit:s])
+               }
+               if false {
+                       if t >= s {
+                               panic("s-t")
+                       }
+                       if (s - t) > maxMatchOffset {
+                               panic(fmt.Sprintln("mmo", t))
+                       }
+                       if l < baseMatchLength {
+                               panic("bml")
+                       }
+               }
+
+               dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+               s += l
+               nextEmit = s
+               if nextS >= s {
+                       s = nextS + 1
+               }
+
+               if s >= sLimit {
+                       // Index first pair after match end.
+                       if int(s+8) < len(src) {
+                               cv := load6432(src, s)
+                               e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+                               e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+                       }
+                       goto emitRemainder
+               }
+
+               // Store every 3rd hash in-between
+               if true {
+                       i := nextS
+                       if i < s-1 {
+                               cv := load6432(src, i)
+                               t := tableEntry{offset: i + e.cur, val: uint32(cv)}
+                               t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1}
+                               e.bTable[hash7(cv, tableBits)] = t
+                               e.bTable[hash7(cv>>8, tableBits)] = t2
+                               e.table[hash4u(t2.val, tableBits)] = t2
+
+                               i += 3
+                               for ; i < s-1; i += 3 {
+                                       cv := load6432(src, i)
+                                       t := tableEntry{offset: i + e.cur, val: uint32(cv)}
+                                       t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1}
+                                       e.bTable[hash7(cv, tableBits)] = t
+                                       e.bTable[hash7(cv>>8, tableBits)] = t2
+                                       e.table[hash4u(t2.val, tableBits)] = t2
+                               }
+                       }
+               }
+
+               // We could immediately start working at s now, but to improve
+               // compression we first update the hash table at s-1 and at s.
+               x := load6432(src, s-1)
+               o := e.cur + s - 1
+               prevHashS := hash4x64(x, tableBits)
+               prevHashL := hash7(x, tableBits)
+               e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)}
+               e.bTable[prevHashL] = tableEntry{offset: o, val: uint32(x)}
+               cv = x >> 8
+       }
+
+emitRemainder:
+       if int(nextEmit) < len(src) {
+               // If nothing was added, don't encode literals.
+               if dst.n == 0 {
+                       return
+               }
+
+               emitLiteral(dst, src[nextEmit:])
+       }
+}
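
Level 4 is the first level to consult two tables per position: table is keyed by a hash of 4 bytes and bTable by a hash of 7 bytes, and a long-hash hit is tried first because agreeing on 7 bytes makes a long match far more likely. An illustrative pair of such hashes (the multipliers and widths below are stand-ins, not the library's hash4x64/hash7):

package main

import (
	"encoding/binary"
	"fmt"
)

const (
	shortBits = 14
	longBits  = 17
	prime4    = 2654435761
	prime8    = 0x9E3779B185EBCA87
)

// hashShort buckets the low 4 bytes of x into the short table.
func hashShort(x uint64) uint32 {
	return (uint32(x) * prime4) >> (32 - shortBits)
}

// hashLong buckets the low 7 bytes of x into the long table.
func hashLong(x uint64) uint32 {
	return uint32(((x << 8) * prime8) >> (64 - longBits))
}

func main() {
	b := []byte("Hello, hello, compress!")
	x := binary.LittleEndian.Uint64(b)
	fmt.Printf("short=%d long=%d\n", hashShort(x), hashLong(x))
}
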
diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go
new file mode 100644 (file)
index 0000000..14a2356
--- /dev/null
@@ -0,0 +1,276 @@
+package flate
+
+import "fmt"
+
+type fastEncL5 struct {
+       fastGen
+       table  [tableSize]tableEntry
+       bTable [tableSize]tableEntryPrev
+}
+
+func (e *fastEncL5) Encode(dst *tokens, src []byte) {
+       const (
+               inputMargin            = 12 - 1
+               minNonLiteralBlockSize = 1 + 1 + inputMargin
+       )
+
+       // Protect against e.cur wraparound.
+       for e.cur >= bufferReset {
+               if len(e.hist) == 0 {
+                       for i := range e.table[:] {
+                               e.table[i] = tableEntry{}
+                       }
+                       for i := range e.bTable[:] {
+                               e.bTable[i] = tableEntryPrev{}
+                       }
+                       e.cur = maxMatchOffset
+                       break
+               }
+               // Shift down everything in the table that isn't already too far away.
+               minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+               for i := range e.table[:] {
+                       v := e.table[i].offset
+                       if v <= minOff {
+                               v = 0
+                       } else {
+                               v = v - e.cur + maxMatchOffset
+                       }
+                       e.table[i].offset = v
+               }
+               for i := range e.bTable[:] {
+                       v := e.bTable[i]
+                       if v.Cur.offset <= minOff {
+                               v.Cur.offset = 0
+                               v.Prev.offset = 0
+                       } else {
+                               v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+                               if v.Prev.offset <= minOff {
+                                       v.Prev.offset = 0
+                               } else {
+                                       v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+                               }
+                       }
+                       e.bTable[i] = v
+               }
+               e.cur = maxMatchOffset
+       }
+
+       s := e.addBlock(src)
+
+       // This check isn't in the Snappy implementation, but there, the caller
+       // instead of the callee handles this case.
+       if len(src) < minNonLiteralBlockSize {
+               // We do not fill the token table.
+               // This will be picked up by caller.
+               dst.n = uint16(len(src))
+               return
+       }
+
+       // Override src
+       src = e.hist
+       nextEmit := s
+
+       // sLimit is when to stop looking for offset/length copies. The inputMargin
+       // lets us use a fast path for emitLiteral in the main loop, while we are
+       // looking for copies.
+       sLimit := int32(len(src) - inputMargin)
+
+       // nextEmit is where in src the next emitLiteral should start from.
+       cv := load6432(src, s)
+       for {
+               const skipLog = 6
+               const doEvery = 1
+
+               nextS := s
+               var l int32
+               var t int32
+               for {
+                       nextHashS := hash4x64(cv, tableBits)
+                       nextHashL := hash7(cv, tableBits)
+
+                       s = nextS
+                       nextS = s + doEvery + (s-nextEmit)>>skipLog
+                       if nextS > sLimit {
+                               goto emitRemainder
+                       }
+                       // Fetch a short+long candidate
+                       sCandidate := e.table[nextHashS]
+                       lCandidate := e.bTable[nextHashL]
+                       next := load6432(src, nextS)
+                       entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+                       e.table[nextHashS] = entry
+                       eLong := &e.bTable[nextHashL]
+                       eLong.Cur, eLong.Prev = entry, eLong.Cur
+
+                       nextHashS = hash4x64(next, tableBits)
+                       nextHashL = hash7(next, tableBits)
+
+                       t = lCandidate.Cur.offset - e.cur
+                       if s-t < maxMatchOffset {
+                               if uint32(cv) == lCandidate.Cur.val {
+                                       // Store the next match
+                                       e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+                                       eLong := &e.bTable[nextHashL]
+                                       eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+
+                                       t2 := lCandidate.Prev.offset - e.cur
+                                       if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+                                               l = e.matchlen(s+4, t+4, src) + 4
+                                               ml1 := e.matchlen(s+4, t2+4, src) + 4
+                                               if ml1 > l {
+                                                       t = t2
+                                                       l = ml1
+                                                       break
+                                               }
+                                       }
+                                       break
+                               }
+                               t = lCandidate.Prev.offset - e.cur
+                               if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+                                       // Store the next match
+                                       e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+                                       eLong := &e.bTable[nextHashL]
+                                       eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+                                       break
+                               }
+                       }
+
+                       t = sCandidate.offset - e.cur
+                       if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
+                               // Found a 4-byte match...
+                               l = e.matchlen(s+4, t+4, src) + 4
+                               lCandidate = e.bTable[nextHashL]
+                               // Store the next match
+
+                               e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+                               eLong := &e.bTable[nextHashL]
+                               eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+
+                               // If the next long is a candidate, use that...
+                               t2 := lCandidate.Cur.offset - e.cur
+                               if nextS-t2 < maxMatchOffset {
+                                       if lCandidate.Cur.val == uint32(next) {
+                                               ml := e.matchlen(nextS+4, t2+4, src) + 4
+                                               if ml > l {
+                                                       t = t2
+                                                       s = nextS
+                                                       l = ml
+                                                       break
+                                               }
+                                       }
+                                       // If the previous long is a candidate, use that...
+                                       t2 = lCandidate.Prev.offset - e.cur
+                                       if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) {
+                                               ml := e.matchlen(nextS+4, t2+4, src) + 4
+                                               if ml > l {
+                                                       t = t2
+                                                       s = nextS
+                                                       l = ml
+                                                       break
+                                               }
+                                       }
+                               }
+                               break
+                       }
+                       cv = next
+               }
+
+               // A 4-byte match has been found. We'll later see if more than 4 bytes
+               // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+               // them as literal bytes.
+
+               // Extend the 4-byte match as long as possible.
+               if l == 0 {
+                       l = e.matchlenLong(s+4, t+4, src) + 4
+               } else if l == maxMatchLength {
+                       l += e.matchlenLong(s+l, t+l, src)
+               }
+               // Extend backwards
+               for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+                       s--
+                       t--
+                       l++
+               }
+               if nextEmit < s {
+                       emitLiteral(dst, src[nextEmit:s])
+               }
+               if false {
+                       if t >= s {
+                               panic(fmt.Sprintln("s-t", s, t))
+                       }
+                       if (s - t) > maxMatchOffset {
+                               panic(fmt.Sprintln("mmo", s-t))
+                       }
+                       if l < baseMatchLength {
+                               panic("bml")
+                       }
+               }
+
+               dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+               s += l
+               nextEmit = s
+               if nextS >= s {
+                       s = nextS + 1
+               }
+
+               if s >= sLimit {
+                       goto emitRemainder
+               }
+
+               // Store every 3rd hash in-between.
+               if true {
+                       const hashEvery = 3
+                       i := s - l + 1
+                       if i < s-1 {
+                               cv := load6432(src, i)
+                               t := tableEntry{offset: i + e.cur, val: uint32(cv)}
+                               e.table[hash4x64(cv, tableBits)] = t
+                               eLong := &e.bTable[hash7(cv, tableBits)]
+                               eLong.Cur, eLong.Prev = t, eLong.Cur
+
+                               // Do a long table entry at i+1
+                               cv >>= 8
+                               t = tableEntry{offset: t.offset + 1, val: uint32(cv)}
+                               eLong = &e.bTable[hash7(cv, tableBits)]
+                               eLong.Cur, eLong.Prev = t, eLong.Cur
+
+                               // We only have enough bits for a short entry at i+2
+                               cv >>= 8
+                               t = tableEntry{offset: t.offset + 1, val: uint32(cv)}
+                               e.table[hash4x64(cv, tableBits)] = t
+
+                               // Skip one - otherwise we risk hitting 's'
+                               i += 4
+                               for ; i < s-1; i += hashEvery {
+                                       cv := load6432(src, i)
+                                       t := tableEntry{offset: i + e.cur, val: uint32(cv)}
+                                       t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)}
+                                       eLong := &e.bTable[hash7(cv, tableBits)]
+                                       eLong.Cur, eLong.Prev = t, eLong.Cur
+                                       e.table[hash4u(t2.val, tableBits)] = t2
+                               }
+                       }
+               }
+
+               // We could immediately start working at s now, but to improve
+               // compression we first update the hash table at s-1 and at s.
+               x := load6432(src, s-1)
+               o := e.cur + s - 1
+               prevHashS := hash4x64(x, tableBits)
+               prevHashL := hash7(x, tableBits)
+               e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)}
+               eLong := &e.bTable[prevHashL]
+               eLong.Cur, eLong.Prev = tableEntry{offset: o, val: uint32(x)}, eLong.Cur
+               cv = x >> 8
+       }
+
+emitRemainder:
+       if int(nextEmit) < len(src) {
+               // If nothing was added, don't encode literals.
+               if dst.n == 0 {
+                       return
+               }
+
+               emitLiteral(dst, src[nextEmit:])
+       }
+}
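
All of the encoders above grow a confirmed 4-byte match in both directions: forward via matchlenLong, and backwards while the bytes just before source and target still agree and the already-emitted literal region is not crossed. A small sketch of that backward step:

package main

import "fmt"

// extendBackward widens a match found at src[s:] against src[t:] of length l by
// stepping both positions back while the preceding bytes agree, stopping at the
// start of the buffer or at nextEmit (bytes already emitted as literals).
func extendBackward(src []byte, s, t, l, nextEmit int) (int, int, int) {
	for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
		s--
		t--
		l++
	}
	return s, t, l
}

func main() {
	src := []byte("xabcd....xabcde")
	// A 4-byte match "abcd" was found at s=10 against t=1; the preceding 'x'
	// also matches, so the match grows to length 5 starting one byte earlier.
	fmt.Println(extendBackward(src, 10, 1, 4, 0)) // 9 0 5
}
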
diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go
new file mode 100644 (file)
index 0000000..cad0c7d
--- /dev/null
@@ -0,0 +1,279 @@
+package flate
+
+import "fmt"
+
+type fastEncL6 struct {
+       fastGen
+       table  [tableSize]tableEntry
+       bTable [tableSize]tableEntryPrev
+}
+
+func (e *fastEncL6) Encode(dst *tokens, src []byte) {
+       const (
+               inputMargin            = 12 - 1
+               minNonLiteralBlockSize = 1 + 1 + inputMargin
+       )
+
+       // Protect against e.cur wraparound.
+       for e.cur >= bufferReset {
+               if len(e.hist) == 0 {
+                       for i := range e.table[:] {
+                               e.table[i] = tableEntry{}
+                       }
+                       for i := range e.bTable[:] {
+                               e.bTable[i] = tableEntryPrev{}
+                       }
+                       e.cur = maxMatchOffset
+                       break
+               }
+               // Shift down everything in the table that isn't already too far away.
+               minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+               for i := range e.table[:] {
+                       v := e.table[i].offset
+                       if v <= minOff {
+                               v = 0
+                       } else {
+                               v = v - e.cur + maxMatchOffset
+                       }
+                       e.table[i].offset = v
+               }
+               for i := range e.bTable[:] {
+                       v := e.bTable[i]
+                       if v.Cur.offset <= minOff {
+                               v.Cur.offset = 0
+                               v.Prev.offset = 0
+                       } else {
+                               v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+                               if v.Prev.offset <= minOff {
+                                       v.Prev.offset = 0
+                               } else {
+                                       v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+                               }
+                       }
+                       e.bTable[i] = v
+               }
+               e.cur = maxMatchOffset
+       }
+
+       s := e.addBlock(src)
+
+       // This check isn't in the Snappy implementation, but there, the caller
+       // instead of the callee handles this case.
+       if len(src) < minNonLiteralBlockSize {
+               // We do not fill the token table.
+               // This will be picked up by caller.
+               dst.n = uint16(len(src))
+               return
+       }
+
+       // Override src
+       src = e.hist
+       nextEmit := s
+
+       // sLimit is when to stop looking for offset/length copies. The inputMargin
+       // lets us use a fast path for emitLiteral in the main loop, while we are
+       // looking for copies.
+       sLimit := int32(len(src) - inputMargin)
+
+       // nextEmit is where in src the next emitLiteral should start from.
+       cv := load6432(src, s)
+       // Repeat MUST be > 1 and within range
+       repeat := int32(1)
+       for {
+               const skipLog = 7
+               const doEvery = 1
+
+               nextS := s
+               var l int32
+               var t int32
+               for {
+                       nextHashS := hash4x64(cv, tableBits)
+                       nextHashL := hash7(cv, tableBits)
+                       s = nextS
+                       nextS = s + doEvery + (s-nextEmit)>>skipLog
+                       if nextS > sLimit {
+                               goto emitRemainder
+                       }
+                       // Fetch a short+long candidate
+                       sCandidate := e.table[nextHashS]
+                       lCandidate := e.bTable[nextHashL]
+                       next := load6432(src, nextS)
+                       entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+                       e.table[nextHashS] = entry
+                       eLong := &e.bTable[nextHashL]
+                       eLong.Cur, eLong.Prev = entry, eLong.Cur
+
+                       // Calculate hashes of 'next'
+                       nextHashS = hash4x64(next, tableBits)
+                       nextHashL = hash7(next, tableBits)
+
+                       t = lCandidate.Cur.offset - e.cur
+                       if s-t < maxMatchOffset {
+                               if uint32(cv) == lCandidate.Cur.val {
+                                       // Long candidate matches at least 4 bytes.
+
+                                       // Store the next match
+                                       e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+                                       eLong := &e.bTable[nextHashL]
+                                       eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+
+                                       // Check the previous long candidate as well.
+                                       t2 := lCandidate.Prev.offset - e.cur
+                                       if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+                                               l = e.matchlen(s+4, t+4, src) + 4
+                                               ml1 := e.matchlen(s+4, t2+4, src) + 4
+                                               if ml1 > l {
+                                                       t = t2
+                                                       l = ml1
+                                                       break
+                                               }
+                                       }
+                                       break
+                               }
+                               // Current value did not match, but check if previous long value does.
+                               t = lCandidate.Prev.offset - e.cur
+                               if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+                                       // Store the next match
+                                       e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+                                       eLong := &e.bTable[nextHashL]
+                                       eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+                                       break
+                               }
+                       }
+
+                       t = sCandidate.offset - e.cur
+                       if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
+                               // Found a 4-byte match...
+                               l = e.matchlen(s+4, t+4, src) + 4
+
+                               // Look up next long candidate (at nextS)
+                               lCandidate = e.bTable[nextHashL]
+
+                               // Store the next match
+                               e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+                               eLong := &e.bTable[nextHashL]
+                               eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+
+                               // Check repeat at s + repOff
+                               const repOff = 1
+                               t2 := s - repeat + repOff
+                               if load3232(src, t2) == uint32(cv>>(8*repOff)) {
+                                       ml := e.matchlen(s+4+repOff, t2+4, src) + 4
+                                       if ml > l {
+                                               t = t2
+                                               l = ml
+                                               s += repOff
+                                               // Not worth checking more.
+                                               break
+                                       }
+                               }
+
+                               // If the next long is a candidate, use that...
+                               t2 = lCandidate.Cur.offset - e.cur
+                               if nextS-t2 < maxMatchOffset {
+                                       if lCandidate.Cur.val == uint32(next) {
+                                               ml := e.matchlen(nextS+4, t2+4, src) + 4
+                                               if ml > l {
+                                                       t = t2
+                                                       s = nextS
+                                                       l = ml
+                                                       // This is ok, but check previous as well.
+                                               }
+                                       }
+                                       // If the previous long is a candidate, use that...
+                                       t2 = lCandidate.Prev.offset - e.cur
+                                       if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) {
+                                               ml := e.matchlen(nextS+4, t2+4, src) + 4
+                                               if ml > l {
+                                                       t = t2
+                                                       s = nextS
+                                                       l = ml
+                                                       break
+                                               }
+                                       }
+                               }
+                               break
+                       }
+                       cv = next
+               }
+
+               // A 4-byte match has been found. We'll later see if more than 4 bytes
+               // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+               // them as literal bytes.
+
+               // Extend the 4-byte match as long as possible.
+               if l == 0 {
+                       l = e.matchlenLong(s+4, t+4, src) + 4
+               } else if l == maxMatchLength {
+                       l += e.matchlenLong(s+l, t+l, src)
+               }
+
+               // Extend backwards
+               for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+                       s--
+                       t--
+                       l++
+               }
+               if nextEmit < s {
+                       emitLiteral(dst, src[nextEmit:s])
+               }
+               if false {
+                       if t >= s {
+                               panic(fmt.Sprintln("s-t", s, t))
+                       }
+                       if (s - t) > maxMatchOffset {
+                               panic(fmt.Sprintln("mmo", s-t))
+                       }
+                       if l < baseMatchLength {
+                               panic("bml")
+                       }
+               }
+
+               dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+               repeat = s - t
+               s += l
+               nextEmit = s
+               if nextS >= s {
+                       s = nextS + 1
+               }
+
+               if s >= sLimit {
+                       // Index after match end.
+                       for i := nextS + 1; i < int32(len(src))-8; i += 2 {
+                               cv := load6432(src, i)
+                               e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur, val: uint32(cv)}
+                               eLong := &e.bTable[hash7(cv, tableBits)]
+                               eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur, val: uint32(cv)}, eLong.Cur
+                       }
+                       goto emitRemainder
+               }
+
+               // Store every long hash in-between and every second short.
+               if true {
+                       for i := nextS + 1; i < s-1; i += 2 {
+                               cv := load6432(src, i)
+                               t := tableEntry{offset: i + e.cur, val: uint32(cv)}
+                               t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)}
+                               eLong := &e.bTable[hash7(cv, tableBits)]
+                               eLong2 := &e.bTable[hash7(cv>>8, tableBits)]
+                               e.table[hash4x64(cv, tableBits)] = t
+                               eLong.Cur, eLong.Prev = t, eLong.Cur
+                               eLong2.Cur, eLong2.Prev = t2, eLong2.Cur
+                       }
+               }
+
+               // We could immediately start working at s now, but to improve
+               // compression we first update the hash table at s-1 and at s.
+               cv = load6432(src, s)
+       }
+
+emitRemainder:
+       if int(nextEmit) < len(src) {
+               // If nothing was added, don't encode literals.
+               if dst.n == 0 {
+                       return
+               }
+
+               emitLiteral(dst, src[nextEmit:])
+       }
+}
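The encoder above keeps two hash tables: a short one keyed on 4 bytes (hash4x64) and a long one keyed on 7 bytes (hash7), where each long-table slot also remembers its previous occupant (Cur/Prev). Below is a minimal sketch of that lookup order; the table sizes, multipliers and the offset+1 "empty" sentinel are illustrative assumptions rather than the library's values (the real code stores offsets relative to e.cur instead), and candidates are verified on 4 bytes, as the code above does.

package main

import (
    "encoding/binary"
    "fmt"
)

const (
    shortBits = 14
    longBits  = 17

    primeShort = 2654435761         // illustrative multiplier, not the library's
    primeLong  = 0x9E3779B185EBCA87 // illustrative multiplier, not the library's
)

type entry struct{ offset int32 }

// entryPair keeps the current and the previous occupant of a long-table slot,
// mirroring the Cur/Prev fields used above.
type entryPair struct{ Cur, Prev entry }

func load64(src []byte, i int32) uint64 { return binary.LittleEndian.Uint64(src[i:]) }

func hashShort(u uint64) uint32 { return (uint32(u) * primeShort) >> (32 - shortBits) }
func hashLong(u uint64) uint32  { return uint32(((u << 8) * primeLong) >> (64 - longBits)) }

func main() {
    src := []byte("abcdefgh........abcdefgh........abcdefgh")
    shortTab := make([]entry, 1<<shortBits)
    longTab := make([]entryPair, 1<<longBits)

    for s := int32(0); s+8 <= int32(len(src)); s++ {
        cv := load64(src, s)
        hs, hl := hashShort(cv), hashLong(cv)

        // Prefer the candidate found via the 7-byte hash, then its previous
        // occupant, then the 4-byte-hash candidate; verify 4 bytes in each case.
        switch {
        case longTab[hl].Cur.offset != 0 && uint32(load64(src, longTab[hl].Cur.offset-1)) == uint32(cv):
            fmt.Printf("long match:  s=%2d t=%2d\n", s, longTab[hl].Cur.offset-1)
        case longTab[hl].Prev.offset != 0 && uint32(load64(src, longTab[hl].Prev.offset-1)) == uint32(cv):
            fmt.Printf("prev match:  s=%2d t=%2d\n", s, longTab[hl].Prev.offset-1)
        case shortTab[hs].offset != 0 && uint32(load64(src, shortTab[hs].offset-1)) == uint32(cv):
            fmt.Printf("short match: s=%2d t=%2d\n", s, shortTab[hs].offset-1)
        }

        // Insert the current position; the long table keeps its old entry in Prev.
        e := entry{offset: s + 1} // store offset+1 so that 0 means "empty"
        shortTab[hs] = e
        longTab[hl].Cur, longTab[hl].Prev = e, longTab[hl].Cur
    }
}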
diff --git a/vendor/github.com/klauspost/compress/flate/reverse_bits.go b/vendor/github.com/klauspost/compress/flate/reverse_bits.go
deleted file mode 100644 (file)
index c1a0272..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package flate
-
-var reverseByte = [256]byte{
-       0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
-       0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
-       0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
-       0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
-       0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
-       0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
-       0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
-       0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
-       0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
-       0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
-       0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
-       0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
-       0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
-       0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
-       0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
-       0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
-       0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
-       0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
-       0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
-       0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
-       0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
-       0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
-       0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
-       0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
-       0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
-       0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
-       0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
-       0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
-       0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
-       0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
-       0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
-       0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
-}
-
-func reverseUint16(v uint16) uint16 {
-       return uint16(reverseByte[v>>8]) | uint16(reverseByte[v&0xFF])<<8
-}
-
-func reverseBits(number uint16, bitLength byte) uint16 {
-       return reverseUint16(number << uint8(16-bitLength))
-}
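reverse_bits.go, removed here, reverses the low bitLength bits of a value, which DEFLATE needs because Huffman codes are written to the stream bit-reversed. Since Go 1.9 the same result can be obtained from math/bits without a lookup table; the sketch below is one possible replacement under that assumption, not necessarily what the new vendored code does.

package main

import (
    "fmt"
    "math/bits"
)

// reverseBits returns the low bitLength bits of number in reversed order,
// matching the table-based helper deleted above.
func reverseBits(number uint16, bitLength byte) uint16 {
    return bits.Reverse16(number << (16 - bitLength))
}

func main() {
    // 0b1011 reversed over 4 bits is 0b1101.
    fmt.Printf("%04b\n", reverseBits(0b1011, 4))
}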
diff --git a/vendor/github.com/klauspost/compress/flate/snappy.go b/vendor/github.com/klauspost/compress/flate/snappy.go
deleted file mode 100644 (file)
index 0bbd946..0000000
+++ /dev/null
@@ -1,856 +0,0 @@
-// Copyright 2011 The Snappy-Go Authors. All rights reserved.
-// Modified for deflate by Klaus Post (c) 2015.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package flate
-
-// emitLiteral writes a literal chunk and returns the number of bytes written.
-func emitLiteral(dst *tokens, lit []byte) {
-       ol := int(dst.n)
-       for i, v := range lit {
-               dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
-       }
-       dst.n += uint16(len(lit))
-}
-
-// emitCopy writes a copy chunk and returns the number of bytes written.
-func emitCopy(dst *tokens, offset, length int) {
-       dst.tokens[dst.n] = matchToken(uint32(length-3), uint32(offset-minOffsetSize))
-       dst.n++
-}
-
-type snappyEnc interface {
-       Encode(dst *tokens, src []byte)
-       Reset()
-}
-
-func newSnappy(level int) snappyEnc {
-       switch level {
-       case 1:
-               return &snappyL1{}
-       case 2:
-               return &snappyL2{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}
-       case 3:
-               return &snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}
-       case 4:
-               return &snappyL4{snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}}
-       default:
-               panic("invalid level specified")
-       }
-}
-
-const (
-       tableBits       = 14             // Bits used in the table
-       tableSize       = 1 << tableBits // Size of the table
-       tableMask       = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
-       tableShift      = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
-       baseMatchOffset = 1              // The smallest match offset
-       baseMatchLength = 3              // The smallest match length per the RFC section 3.2.5
-       maxMatchOffset  = 1 << 15        // The largest match offset
-)
-
-func load32(b []byte, i int) uint32 {
-       b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-       return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
-}
-
-func load64(b []byte, i int) uint64 {
-       b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-       return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-               uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
-}
-
-func hash(u uint32) uint32 {
-       return (u * 0x1e35a7bd) >> tableShift
-}
-
-// snappyL1 encapsulates level 1 compression
-type snappyL1 struct{}
-
-func (e *snappyL1) Reset() {}
-
-func (e *snappyL1) Encode(dst *tokens, src []byte) {
-       const (
-               inputMargin            = 16 - 1
-               minNonLiteralBlockSize = 1 + 1 + inputMargin
-       )
-
-       // This check isn't in the Snappy implementation, but there, the caller
-       // instead of the callee handles this case.
-       if len(src) < minNonLiteralBlockSize {
-               // We do not fill the token table.
-               // This will be picked up by caller.
-               dst.n = uint16(len(src))
-               return
-       }
-
-       // Initialize the hash table.
-       //
-       // The table element type is uint16, as s < sLimit and sLimit < len(src)
-       // and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
-       var table [tableSize]uint16
-
-       // sLimit is when to stop looking for offset/length copies. The inputMargin
-       // lets us use a fast path for emitLiteral in the main loop, while we are
-       // looking for copies.
-       sLimit := len(src) - inputMargin
-
-       // nextEmit is where in src the next emitLiteral should start from.
-       nextEmit := 0
-
-       // The encoded form must start with a literal, as there are no previous
-       // bytes to copy, so we start looking for hash matches at s == 1.
-       s := 1
-       nextHash := hash(load32(src, s))
-
-       for {
-               // Copied from the C++ snappy implementation:
-               //
-               // Heuristic match skipping: If 32 bytes are scanned with no matches
-               // found, start looking only at every other byte. If 32 more bytes are
-               // scanned (or skipped), look at every third byte, etc.. When a match
-               // is found, immediately go back to looking at every byte. This is a
-               // small loss (~5% performance, ~0.1% density) for compressible data
-               // due to more bookkeeping, but for non-compressible data (such as
-               // JPEG) it's a huge win since the compressor quickly "realizes" the
-               // data is incompressible and doesn't bother looking for matches
-               // everywhere.
-               //
-               // The "skip" variable keeps track of how many bytes there are since
-               // the last match; dividing it by 32 (ie. right-shifting by five) gives
-               // the number of bytes to move ahead for each iteration.
-               skip := 32
-
-               nextS := s
-               candidate := 0
-               for {
-                       s = nextS
-                       bytesBetweenHashLookups := skip >> 5
-                       nextS = s + bytesBetweenHashLookups
-                       skip += bytesBetweenHashLookups
-                       if nextS > sLimit {
-                               goto emitRemainder
-                       }
-                       candidate = int(table[nextHash&tableMask])
-                       table[nextHash&tableMask] = uint16(s)
-                       nextHash = hash(load32(src, nextS))
-                       // TODO: < should be <=, and add a test for that.
-                       if s-candidate < maxMatchOffset && load32(src, s) == load32(src, candidate) {
-                               break
-                       }
-               }
-
-               // A 4-byte match has been found. We'll later see if more than 4 bytes
-               // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-               // them as literal bytes.
-               emitLiteral(dst, src[nextEmit:s])
-
-               // Call emitCopy, and then see if another emitCopy could be our next
-               // move. Repeat until we find no match for the input immediately after
-               // what was consumed by the last emitCopy call.
-               //
-               // If we exit this loop normally then we need to call emitLiteral next,
-               // though we don't yet know how big the literal will be. We handle that
-               // by proceeding to the next iteration of the main loop. We also can
-               // exit this loop via goto if we get close to exhausting the input.
-               for {
-                       // Invariant: we have a 4-byte match at s, and no need to emit any
-                       // literal bytes prior to s.
-                       base := s
-
-                       // Extend the 4-byte match as long as possible.
-                       //
-                       // This is an inlined version of Snappy's:
-                       //      s = extendMatch(src, candidate+4, s+4)
-                       s += 4
-                       s1 := base + maxMatchLength
-                       if s1 > len(src) {
-                               s1 = len(src)
-                       }
-                       a := src[s:s1]
-                       b := src[candidate+4:]
-                       b = b[:len(a)]
-                       l := len(a)
-                       for i := range a {
-                               if a[i] != b[i] {
-                                       l = i
-                                       break
-                               }
-                       }
-                       s += l
-
-                       // matchToken is flate's equivalent of Snappy's emitCopy.
-                       dst.tokens[dst.n] = matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset))
-                       dst.n++
-                       nextEmit = s
-                       if s >= sLimit {
-                               goto emitRemainder
-                       }
-
-                       // We could immediately start working at s now, but to improve
-                       // compression we first update the hash table at s-1 and at s. If
-                       // another emitCopy is not our next move, also calculate nextHash
-                       // at s+1. At least on GOARCH=amd64, these three hash calculations
-                       // are faster as one load64 call (with some shifts) instead of
-                       // three load32 calls.
-                       x := load64(src, s-1)
-                       prevHash := hash(uint32(x >> 0))
-                       table[prevHash&tableMask] = uint16(s - 1)
-                       currHash := hash(uint32(x >> 8))
-                       candidate = int(table[currHash&tableMask])
-                       table[currHash&tableMask] = uint16(s)
-                       // TODO: >= should be >, and add a test for that.
-                       if s-candidate >= maxMatchOffset || uint32(x>>8) != load32(src, candidate) {
-                               nextHash = hash(uint32(x >> 16))
-                               s++
-                               break
-                       }
-               }
-       }
-
-emitRemainder:
-       if nextEmit < len(src) {
-               emitLiteral(dst, src[nextEmit:])
-       }
-}
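The "heuristic match skipping" described in the comment above is easiest to see numerically: the step between hash probes is skip>>5, and skip grows by that step on every probe, so the encoder scans every byte for the first 32 probes after a match and then moves ahead faster the longer nothing matches. A tiny sketch of just that arithmetic:

package main

import "fmt"

func main() {
    // Reproduce the probe positions generated by the skip heuristic above:
    // the step stays 1 for the first 32 probes after a match, then keeps
    // growing the longer no match is found.
    s, skip := 1, 32
    for probe := 0; probe < 128; probe++ {
        step := skip >> 5
        if probe%16 == 0 {
            fmt.Printf("probe %3d: offset %4d, step %d\n", probe, s, step)
        }
        skip += step
        s += step
    }
}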
-
-type tableEntry struct {
-       val    uint32
-       offset int32
-}
-
-func load3232(b []byte, i int32) uint32 {
-       b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-       return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
-}
-
-func load6432(b []byte, i int32) uint64 {
-       b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-       return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-               uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
-}
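load3232/load6432 (and load32/load64 above) are unaligned little-endian loads; the three-index slice expression caps the slice so the compiler can drop bounds checks and fuse the byte reads into a single load. They are equivalent to encoding/binary, as this small sketch shows:

package main

import (
    "encoding/binary"
    "fmt"
)

// Equivalent little-endian loads written with encoding/binary; the hand-rolled
// versions above spell out the bytes so the compiler can prove the reads are
// in bounds and combine them.
func load3232(b []byte, i int32) uint32 { return binary.LittleEndian.Uint32(b[i:]) }
func load6432(b []byte, i int32) uint64 { return binary.LittleEndian.Uint64(b[i:]) }

func main() {
    b := []byte{1, 2, 3, 4, 5, 6, 7, 8}
    fmt.Printf("%#x %#x\n", load3232(b, 0), load6432(b, 0))
    // Output: 0x4030201 0x807060504030201
}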
-
-// snappyGen maintains the table for matches,
-// and the previous byte block for level 2.
-// This is the generic implementation.
-type snappyGen struct {
-       prev []byte
-       cur  int32
-}
-
-// snappyGen maintains the table for matches,
-// and the previous byte block for level 2.
-// This is the generic implementation.
-type snappyL2 struct {
-       snappyGen
-       table [tableSize]tableEntry
-}
-
-// EncodeL2 uses a similar algorithm to level 1, but is capable
-// of matching across blocks giving better compression at a small slowdown.
-func (e *snappyL2) Encode(dst *tokens, src []byte) {
-       const (
-               inputMargin            = 16 - 1
-               minNonLiteralBlockSize = 1 + 1 + inputMargin
-       )
-
-       // Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
-       if e.cur > 1<<30 {
-               for i := range e.table {
-                       e.table[i] = tableEntry{}
-               }
-               e.cur = maxStoreBlockSize
-       }
-
-       // This check isn't in the Snappy implementation, but there, the caller
-       // instead of the callee handles this case.
-       if len(src) < minNonLiteralBlockSize {
-               // We do not fill the token table.
-               // This will be picked up by caller.
-               dst.n = uint16(len(src))
-               e.cur += maxStoreBlockSize
-               e.prev = e.prev[:0]
-               return
-       }
-
-       // sLimit is when to stop looking for offset/length copies. The inputMargin
-       // lets us use a fast path for emitLiteral in the main loop, while we are
-       // looking for copies.
-       sLimit := int32(len(src) - inputMargin)
-
-       // nextEmit is where in src the next emitLiteral should start from.
-       nextEmit := int32(0)
-       s := int32(0)
-       cv := load3232(src, s)
-       nextHash := hash(cv)
-
-       for {
-               // Copied from the C++ snappy implementation:
-               //
-               // Heuristic match skipping: If 32 bytes are scanned with no matches
-               // found, start looking only at every other byte. If 32 more bytes are
-               // scanned (or skipped), look at every third byte, etc.. When a match
-               // is found, immediately go back to looking at every byte. This is a
-               // small loss (~5% performance, ~0.1% density) for compressible data
-               // due to more bookkeeping, but for non-compressible data (such as
-               // JPEG) it's a huge win since the compressor quickly "realizes" the
-               // data is incompressible and doesn't bother looking for matches
-               // everywhere.
-               //
-               // The "skip" variable keeps track of how many bytes there are since
-               // the last match; dividing it by 32 (ie. right-shifting by five) gives
-               // the number of bytes to move ahead for each iteration.
-               skip := int32(32)
-
-               nextS := s
-               var candidate tableEntry
-               for {
-                       s = nextS
-                       bytesBetweenHashLookups := skip >> 5
-                       nextS = s + bytesBetweenHashLookups
-                       skip += bytesBetweenHashLookups
-                       if nextS > sLimit {
-                               goto emitRemainder
-                       }
-                       candidate = e.table[nextHash&tableMask]
-                       now := load3232(src, nextS)
-                       e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv}
-                       nextHash = hash(now)
-
-                       offset := s - (candidate.offset - e.cur)
-                       if offset >= maxMatchOffset || cv != candidate.val {
-                               // Out of range or not matched.
-                               cv = now
-                               continue
-                       }
-                       break
-               }
-
-               // A 4-byte match has been found. We'll later see if more than 4 bytes
-               // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-               // them as literal bytes.
-               emitLiteral(dst, src[nextEmit:s])
-
-               // Call emitCopy, and then see if another emitCopy could be our next
-               // move. Repeat until we find no match for the input immediately after
-               // what was consumed by the last emitCopy call.
-               //
-               // If we exit this loop normally then we need to call emitLiteral next,
-               // though we don't yet know how big the literal will be. We handle that
-               // by proceeding to the next iteration of the main loop. We also can
-               // exit this loop via goto if we get close to exhausting the input.
-               for {
-                       // Invariant: we have a 4-byte match at s, and no need to emit any
-                       // literal bytes prior to s.
-
-                       // Extend the 4-byte match as long as possible.
-                       //
-                       s += 4
-                       t := candidate.offset - e.cur + 4
-                       l := e.matchlen(s, t, src)
-
-                       // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
-                       dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
-                       dst.n++
-                       s += l
-                       nextEmit = s
-                       if s >= sLimit {
-                               goto emitRemainder
-                       }
-
-                       // We could immediately start working at s now, but to improve
-                       // compression we first update the hash table at s-1 and at s. If
-                       // another emitCopy is not our next move, also calculate nextHash
-                       // at s+1. At least on GOARCH=amd64, these three hash calculations
-                       // are faster as one load64 call (with some shifts) instead of
-                       // three load32 calls.
-                       x := load6432(src, s-1)
-                       prevHash := hash(uint32(x))
-                       e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)}
-                       x >>= 8
-                       currHash := hash(uint32(x))
-                       candidate = e.table[currHash&tableMask]
-                       e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)}
-
-                       offset := s - (candidate.offset - e.cur)
-                       if offset >= maxMatchOffset || uint32(x) != candidate.val {
-                               cv = uint32(x >> 8)
-                               nextHash = hash(cv)
-                               s++
-                               break
-                       }
-               }
-       }
-
-emitRemainder:
-       if int(nextEmit) < len(src) {
-               emitLiteral(dst, src[nextEmit:])
-       }
-       e.cur += int32(len(src))
-       e.prev = e.prev[:len(src)]
-       copy(e.prev, src)
-}
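The level 2 and higher encoders record table offsets as s + e.cur and recover a source position with candidate.offset - e.cur; after each block e.cur advances by len(src) and the block is copied into e.prev, so a recovered position that comes out negative points into the previous block. That is what "matching across blocks" means above, and why e.cur is reset before it can approach 1<<30. A minimal sketch of that bookkeeping (the enc type and the position helper are illustrative, not the library's API):

package main

import "fmt"

// enc mirrors the cur/prev bookkeeping of the encoders above.
type enc struct {
    cur  int32
    prev []byte
}

func (e *enc) position(tableOffset int32, src []byte) []byte {
    t := tableOffset - e.cur
    if t >= 0 {
        return src[t:] // match starts inside the current block
    }
    return e.prev[int32(len(e.prev))+t:] // match starts in the previous block
}

func main() {
    e := &enc{cur: 65535} // maxStoreBlockSize in the real encoder
    block1 := []byte("hello world")

    // While encoding block1, position 6 ("world") is recorded as 6+e.cur.
    recorded := int32(6) + e.cur

    // Finish block1: remember it and advance cur, as the Encode methods above do.
    e.prev = append(e.prev[:0], block1...)
    e.cur += int32(len(block1))

    block2 := []byte("say world")
    fmt.Printf("%q\n", e.position(recorded, block2)) // "world", from the previous block
}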
-
-type tableEntryPrev struct {
-       Cur  tableEntry
-       Prev tableEntry
-}
-
-// snappyL3
-type snappyL3 struct {
-       snappyGen
-       table [tableSize]tableEntryPrev
-}
-
-// Encode uses a similar algorithm to level 2, will check up to two candidates.
-func (e *snappyL3) Encode(dst *tokens, src []byte) {
-       const (
-               inputMargin            = 16 - 1
-               minNonLiteralBlockSize = 1 + 1 + inputMargin
-       )
-
-       // Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
-       if e.cur > 1<<30 {
-               for i := range e.table {
-                       e.table[i] = tableEntryPrev{}
-               }
-               e.cur = maxStoreBlockSize
-       }
-
-       // This check isn't in the Snappy implementation, but there, the caller
-       // instead of the callee handles this case.
-       if len(src) < minNonLiteralBlockSize {
-               // We do not fill the token table.
-               // This will be picked up by caller.
-               dst.n = uint16(len(src))
-               e.cur += maxStoreBlockSize
-               e.prev = e.prev[:0]
-               return
-       }
-
-       // sLimit is when to stop looking for offset/length copies. The inputMargin
-       // lets us use a fast path for emitLiteral in the main loop, while we are
-       // looking for copies.
-       sLimit := int32(len(src) - inputMargin)
-
-       // nextEmit is where in src the next emitLiteral should start from.
-       nextEmit := int32(0)
-       s := int32(0)
-       cv := load3232(src, s)
-       nextHash := hash(cv)
-
-       for {
-               // Copied from the C++ snappy implementation:
-               //
-               // Heuristic match skipping: If 32 bytes are scanned with no matches
-               // found, start looking only at every other byte. If 32 more bytes are
-               // scanned (or skipped), look at every third byte, etc.. When a match
-               // is found, immediately go back to looking at every byte. This is a
-               // small loss (~5% performance, ~0.1% density) for compressible data
-               // due to more bookkeeping, but for non-compressible data (such as
-               // JPEG) it's a huge win since the compressor quickly "realizes" the
-               // data is incompressible and doesn't bother looking for matches
-               // everywhere.
-               //
-               // The "skip" variable keeps track of how many bytes there are since
-               // the last match; dividing it by 32 (ie. right-shifting by five) gives
-               // the number of bytes to move ahead for each iteration.
-               skip := int32(32)
-
-               nextS := s
-               var candidate tableEntry
-               for {
-                       s = nextS
-                       bytesBetweenHashLookups := skip >> 5
-                       nextS = s + bytesBetweenHashLookups
-                       skip += bytesBetweenHashLookups
-                       if nextS > sLimit {
-                               goto emitRemainder
-                       }
-                       candidates := e.table[nextHash&tableMask]
-                       now := load3232(src, nextS)
-                       e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
-                       nextHash = hash(now)
-
-                       // Check both candidates
-                       candidate = candidates.Cur
-                       if cv == candidate.val {
-                               offset := s - (candidate.offset - e.cur)
-                               if offset < maxMatchOffset {
-                                       break
-                               }
-                       } else {
-                               // We only check if value mismatches.
-                               // Offset will always be invalid in other cases.
-                               candidate = candidates.Prev
-                               if cv == candidate.val {
-                                       offset := s - (candidate.offset - e.cur)
-                                       if offset < maxMatchOffset {
-                                               break
-                                       }
-                               }
-                       }
-                       cv = now
-               }
-
-               // A 4-byte match has been found. We'll later see if more than 4 bytes
-               // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-               // them as literal bytes.
-               emitLiteral(dst, src[nextEmit:s])
-
-               // Call emitCopy, and then see if another emitCopy could be our next
-               // move. Repeat until we find no match for the input immediately after
-               // what was consumed by the last emitCopy call.
-               //
-               // If we exit this loop normally then we need to call emitLiteral next,
-               // though we don't yet know how big the literal will be. We handle that
-               // by proceeding to the next iteration of the main loop. We also can
-               // exit this loop via goto if we get close to exhausting the input.
-               for {
-                       // Invariant: we have a 4-byte match at s, and no need to emit any
-                       // literal bytes prior to s.
-
-                       // Extend the 4-byte match as long as possible.
-                       //
-                       s += 4
-                       t := candidate.offset - e.cur + 4
-                       l := e.matchlen(s, t, src)
-
-                       // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
-                       dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
-                       dst.n++
-                       s += l
-                       nextEmit = s
-                       if s >= sLimit {
-                               goto emitRemainder
-                       }
-
-                       // We could immediately start working at s now, but to improve
-                       // compression we first update the hash table at s-2, s-1 and at s. If
-                       // another emitCopy is not our next move, also calculate nextHash
-                       // at s+1. At least on GOARCH=amd64, these three hash calculations
-                       // are faster as one load64 call (with some shifts) instead of
-                       // three load32 calls.
-                       x := load6432(src, s-2)
-                       prevHash := hash(uint32(x))
-
-                       e.table[prevHash&tableMask] = tableEntryPrev{
-                               Prev: e.table[prevHash&tableMask].Cur,
-                               Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
-                       }
-                       x >>= 8
-                       prevHash = hash(uint32(x))
-
-                       e.table[prevHash&tableMask] = tableEntryPrev{
-                               Prev: e.table[prevHash&tableMask].Cur,
-                               Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
-                       }
-                       x >>= 8
-                       currHash := hash(uint32(x))
-                       candidates := e.table[currHash&tableMask]
-                       cv = uint32(x)
-                       e.table[currHash&tableMask] = tableEntryPrev{
-                               Prev: candidates.Cur,
-                               Cur:  tableEntry{offset: s + e.cur, val: cv},
-                       }
-
-                       // Check both candidates
-                       candidate = candidates.Cur
-                       if cv == candidate.val {
-                               offset := s - (candidate.offset - e.cur)
-                               if offset < maxMatchOffset {
-                                       continue
-                               }
-                       } else {
-                               // We only check if value mismatches.
-                               // Offset will always be invalid in other cases.
-                               candidate = candidates.Prev
-                               if cv == candidate.val {
-                                       offset := s - (candidate.offset - e.cur)
-                                       if offset < maxMatchOffset {
-                                               continue
-                                       }
-                               }
-                       }
-                       cv = uint32(x >> 8)
-                       nextHash = hash(cv)
-                       s++
-                       break
-               }
-       }
-
-emitRemainder:
-       if int(nextEmit) < len(src) {
-               emitLiteral(dst, src[nextEmit:])
-       }
-       e.cur += int32(len(src))
-       e.prev = e.prev[:len(src)]
-       copy(e.prev, src)
-}
-
-// snappyL4
-type snappyL4 struct {
-       snappyL3
-}
-
-// Encode uses a similar algorithm to level 3,
-// but will check up to two candidates if first isn't long enough.
-func (e *snappyL4) Encode(dst *tokens, src []byte) {
-       const (
-               inputMargin            = 16 - 1
-               minNonLiteralBlockSize = 1 + 1 + inputMargin
-               matchLenGood           = 12
-       )
-
-       // Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
-       if e.cur > 1<<30 {
-               for i := range e.table {
-                       e.table[i] = tableEntryPrev{}
-               }
-               e.cur = maxStoreBlockSize
-       }
-
-       // This check isn't in the Snappy implementation, but there, the caller
-       // instead of the callee handles this case.
-       if len(src) < minNonLiteralBlockSize {
-               // We do not fill the token table.
-               // This will be picked up by caller.
-               dst.n = uint16(len(src))
-               e.cur += maxStoreBlockSize
-               e.prev = e.prev[:0]
-               return
-       }
-
-       // sLimit is when to stop looking for offset/length copies. The inputMargin
-       // lets us use a fast path for emitLiteral in the main loop, while we are
-       // looking for copies.
-       sLimit := int32(len(src) - inputMargin)
-
-       // nextEmit is where in src the next emitLiteral should start from.
-       nextEmit := int32(0)
-       s := int32(0)
-       cv := load3232(src, s)
-       nextHash := hash(cv)
-
-       for {
-               // Copied from the C++ snappy implementation:
-               //
-               // Heuristic match skipping: If 32 bytes are scanned with no matches
-               // found, start looking only at every other byte. If 32 more bytes are
-               // scanned (or skipped), look at every third byte, etc.. When a match
-               // is found, immediately go back to looking at every byte. This is a
-               // small loss (~5% performance, ~0.1% density) for compressible data
-               // due to more bookkeeping, but for non-compressible data (such as
-               // JPEG) it's a huge win since the compressor quickly "realizes" the
-               // data is incompressible and doesn't bother looking for matches
-               // everywhere.
-               //
-               // The "skip" variable keeps track of how many bytes there are since
-               // the last match; dividing it by 32 (ie. right-shifting by five) gives
-               // the number of bytes to move ahead for each iteration.
-               skip := int32(32)
-
-               nextS := s
-               var candidate tableEntry
-               var candidateAlt tableEntry
-               for {
-                       s = nextS
-                       bytesBetweenHashLookups := skip >> 5
-                       nextS = s + bytesBetweenHashLookups
-                       skip += bytesBetweenHashLookups
-                       if nextS > sLimit {
-                               goto emitRemainder
-                       }
-                       candidates := e.table[nextHash&tableMask]
-                       now := load3232(src, nextS)
-                       e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
-                       nextHash = hash(now)
-
-                       // Check both candidates
-                       candidate = candidates.Cur
-                       if cv == candidate.val {
-                               offset := s - (candidate.offset - e.cur)
-                               if offset < maxMatchOffset {
-                                       offset = s - (candidates.Prev.offset - e.cur)
-                                       if cv == candidates.Prev.val && offset < maxMatchOffset {
-                                               candidateAlt = candidates.Prev
-                                       }
-                                       break
-                               }
-                       } else {
-                               // We only check if value mismatches.
-                               // Offset will always be invalid in other cases.
-                               candidate = candidates.Prev
-                               if cv == candidate.val {
-                                       offset := s - (candidate.offset - e.cur)
-                                       if offset < maxMatchOffset {
-                                               break
-                                       }
-                               }
-                       }
-                       cv = now
-               }
-
-               // A 4-byte match has been found. We'll later see if more than 4 bytes
-               // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-               // them as literal bytes.
-               emitLiteral(dst, src[nextEmit:s])
-
-               // Call emitCopy, and then see if another emitCopy could be our next
-               // move. Repeat until we find no match for the input immediately after
-               // what was consumed by the last emitCopy call.
-               //
-               // If we exit this loop normally then we need to call emitLiteral next,
-               // though we don't yet know how big the literal will be. We handle that
-               // by proceeding to the next iteration of the main loop. We also can
-               // exit this loop via goto if we get close to exhausting the input.
-               for {
-                       // Invariant: we have a 4-byte match at s, and no need to emit any
-                       // literal bytes prior to s.
-
-                       // Extend the 4-byte match as long as possible.
-                       //
-                       s += 4
-                       t := candidate.offset - e.cur + 4
-                       l := e.matchlen(s, t, src)
-                       // Try alternative candidate if match length < matchLenGood.
-                       if l < matchLenGood-4 && candidateAlt.offset != 0 {
-                               t2 := candidateAlt.offset - e.cur + 4
-                               l2 := e.matchlen(s, t2, src)
-                               if l2 > l {
-                                       l = l2
-                                       t = t2
-                               }
-                       }
-                       // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
-                       dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
-                       dst.n++
-                       s += l
-                       nextEmit = s
-                       if s >= sLimit {
-                               goto emitRemainder
-                       }
-
-                       // We could immediately start working at s now, but to improve
-                       // compression we first update the hash table at s-2, s-1 and at s. If
-                       // another emitCopy is not our next move, also calculate nextHash
-                       // at s+1. At least on GOARCH=amd64, these three hash calculations
-                       // are faster as one load64 call (with some shifts) instead of
-                       // three load32 calls.
-                       x := load6432(src, s-2)
-                       prevHash := hash(uint32(x))
-
-                       e.table[prevHash&tableMask] = tableEntryPrev{
-                               Prev: e.table[prevHash&tableMask].Cur,
-                               Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
-                       }
-                       x >>= 8
-                       prevHash = hash(uint32(x))
-
-                       e.table[prevHash&tableMask] = tableEntryPrev{
-                               Prev: e.table[prevHash&tableMask].Cur,
-                               Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
-                       }
-                       x >>= 8
-                       currHash := hash(uint32(x))
-                       candidates := e.table[currHash&tableMask]
-                       cv = uint32(x)
-                       e.table[currHash&tableMask] = tableEntryPrev{
-                               Prev: candidates.Cur,
-                               Cur:  tableEntry{offset: s + e.cur, val: cv},
-                       }
-
-                       // Check both candidates
-                       candidate = candidates.Cur
-                       candidateAlt = tableEntry{}
-                       if cv == candidate.val {
-                               offset := s - (candidate.offset - e.cur)
-                               if offset < maxMatchOffset {
-                                       offset = s - (candidates.Prev.offset - e.cur)
-                                       if cv == candidates.Prev.val && offset < maxMatchOffset {
-                                               candidateAlt = candidates.Prev
-                                       }
-                                       continue
-                               }
-                       } else {
-                               // We only check if value mismatches.
-                               // Offset will always be invalid in other cases.
-                               candidate = candidates.Prev
-                               if cv == candidate.val {
-                                       offset := s - (candidate.offset - e.cur)
-                                       if offset < maxMatchOffset {
-                                               continue
-                                       }
-                               }
-                       }
-                       cv = uint32(x >> 8)
-                       nextHash = hash(cv)
-                       s++
-                       break
-               }
-       }
-
-emitRemainder:
-       if int(nextEmit) < len(src) {
-               emitLiteral(dst, src[nextEmit:])
-       }
-       e.cur += int32(len(src))
-       e.prev = e.prev[:len(src)]
-       copy(e.prev, src)
-}
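Level 4 differs from level 3 only in the matchLenGood heuristic above: when the primary candidate's match is shorter than 12 bytes, the previous candidate for the same hash is measured too and the longer match wins. A small sketch of that choice (the pick helper and the sample data are illustrative):

package main

import "fmt"

// When the primary candidate's match is "short", also measure the alternative
// candidate and keep whichever matches longer, as the level-4 encoder above does.
const matchLenGood = 12

func matchLen(a, b []byte) int {
    n := 0
    for n < len(a) && n < len(b) && a[n] == b[n] {
        n++
    }
    return n
}

func pick(src []byte, s, t, tAlt int) (bestT, bestLen int) {
    bestT, bestLen = t, matchLen(src[s:], src[t:])
    if bestLen < matchLenGood && tAlt >= 0 {
        if l2 := matchLen(src[s:], src[tAlt:]); l2 > bestLen {
            bestT, bestLen = tAlt, l2
        }
    }
    return bestT, bestLen
}

func main() {
    src := []byte("abcdXXXXabcdefgh________abcdefgh")
    // Position 24 matches 4 bytes at offset 0 ("abcd") but 8 bytes at offset 8.
    t, l := pick(src, 24, 0, 8)
    fmt.Println(t, l) // 8 8
}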
-
-func (e *snappyGen) matchlen(s, t int32, src []byte) int32 {
-       s1 := int(s) + maxMatchLength - 4
-       if s1 > len(src) {
-               s1 = len(src)
-       }
-
-       // If we are inside the current block
-       if t >= 0 {
-               b := src[t:]
-               a := src[s:s1]
-               b = b[:len(a)]
-               // Extend the match to be as long as possible.
-               for i := range a {
-                       if a[i] != b[i] {
-                               return int32(i)
-                       }
-               }
-               return int32(len(a))
-       }
-
-       // We found a match in the previous block.
-       tp := int32(len(e.prev)) + t
-       if tp < 0 {
-               return 0
-       }
-
-       // Extend the match to be as long as possible.
-       a := src[s:s1]
-       b := e.prev[tp:]
-       if len(b) > len(a) {
-               b = b[:len(a)]
-       }
-       a = a[:len(b)]
-       for i := range b {
-               if a[i] != b[i] {
-                       return int32(i)
-               }
-       }
-       n := int32(len(b))
-       a = src[s+n : s1]
-       b = src[:len(a)]
-       for i := range a {
-               if a[i] != b[i] {
-                       return int32(i) + n
-               }
-       }
-       return int32(len(a)) + n
-}
-
-// Reset the encoding table.
-func (e *snappyGen) Reset() {
-       e.prev = e.prev[:0]
-       e.cur += maxMatchOffset + 1
-}
diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go
new file mode 100644 (file)
index 0000000..524ee0a
--- /dev/null
@@ -0,0 +1,252 @@
+package flate
+
+import (
+       "io"
+       "math"
+)
+
+const (
+       maxStatelessBlock = math.MaxInt16
+
+       slTableBits  = 13
+       slTableSize  = 1 << slTableBits
+       slTableShift = 32 - slTableBits
+)
+
+type statelessWriter struct {
+       dst    io.Writer
+       closed bool
+}
+
+func (s *statelessWriter) Close() error {
+       if s.closed {
+               return nil
+       }
+       s.closed = true
+       // Emit EOF block
+       return StatelessDeflate(s.dst, nil, true)
+}
+
+func (s *statelessWriter) Write(p []byte) (n int, err error) {
+       err = StatelessDeflate(s.dst, p, false)
+       if err != nil {
+               return 0, err
+       }
+       return len(p), nil
+}
+
+func (s *statelessWriter) Reset(w io.Writer) {
+       s.dst = w
+       s.closed = false
+}
+
+// NewStatelessWriter will do compression without maintaining any state
+// between Write calls.
+// No memory is kept between Write calls,
+// but compression and speed will be suboptimal.
+// Because of this, the size of the actual Write calls will affect the output size.
+func NewStatelessWriter(dst io.Writer) io.WriteCloser {
+       return &statelessWriter{dst: dst}
+}
+
+// StatelessDeflate allows compressing directly to a Writer without retaining state.
+// Everything will be flushed before returning.
+func StatelessDeflate(out io.Writer, in []byte, eof bool) error {
+       var dst tokens
+       bw := newHuffmanBitWriter(out)
+       if eof && len(in) == 0 {
+               // Just write an EOF block.
+               // Could be faster...
+               bw.writeStoredHeader(0, true)
+               bw.flush()
+               return bw.err
+       }
+
+       for len(in) > 0 {
+               todo := in
+               if len(todo) > maxStatelessBlock {
+                       todo = todo[:maxStatelessBlock]
+               }
+               in = in[len(todo):]
+               // Compress
+               statelessEnc(&dst, todo)
+               isEof := eof && len(in) == 0
+
+               if dst.n == 0 {
+                       bw.writeStoredHeader(len(todo), isEof)
+                       if bw.err != nil {
+                               return bw.err
+                       }
+                       bw.writeBytes(todo)
+               } else if int(dst.n) > len(todo)-len(todo)>>4 {
+                       // If we removed less than 1/16th, huffman compress the block.
+                       bw.writeBlockHuff(isEof, todo, false)
+               } else {
+                       bw.writeBlockDynamic(&dst, isEof, todo, false)
+               }
+               if bw.err != nil {
+                       return bw.err
+               }
+               dst.Reset()
+       }
+       if !eof {
+               // Align.
+               bw.writeStoredHeader(0, false)
+       }
+       bw.flush()
+       return bw.err
+}
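StatelessDeflate and NewStatelessWriter are exported by this new file, so a caller can produce standard DEFLATE output without the encoder keeping any history between calls. A sketch of how it could be used, assuming the upstream import path (the same code is vendored in this tree) and that the output round-trips through the standard library's reader:

package main

import (
    "bytes"
    "compress/flate"
    "fmt"
    "io"
    "log"

    flt "github.com/klauspost/compress/flate"
)

func main() {
    var buf bytes.Buffer

    // NewStatelessWriter keeps no state between Write calls, so each call is
    // compressed independently; Close emits the final (EOF) block.
    w := flt.NewStatelessWriter(&buf)
    if _, err := w.Write([]byte("hello hello hello")); err != nil {
        log.Fatal(err)
    }
    if err := w.Close(); err != nil {
        log.Fatal(err)
    }

    // The output is plain DEFLATE, readable by the standard library.
    r := flate.NewReader(&buf)
    out, err := io.ReadAll(r)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("%s\n", out)
}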
+
+func hashSL(u uint32) uint32 {
+       return (u * 0x1e35a7bd) >> slTableShift
+}
+
+func load3216(b []byte, i int16) uint32 {
+       // Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+       b = b[i:]
+       b = b[:4]
+       return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load6416(b []byte, i int16) uint64 {
+       // Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+       b = b[i:]
+       b = b[:8]
+       return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+               uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func statelessEnc(dst *tokens, src []byte) {
+       const (
+               inputMargin            = 12 - 1
+               minNonLiteralBlockSize = 1 + 1 + inputMargin
+       )
+
+       type tableEntry struct {
+               offset int16
+       }
+
+       var table [slTableSize]tableEntry
+
+       // This check isn't in the Snappy implementation, but there, the caller
+       // instead of the callee handles this case.
+       if len(src) < minNonLiteralBlockSize {
+               // We do not fill the token table.
+               // This will be picked up by caller.
+               dst.n = uint16(len(src))
+               return
+       }
+
+       s := int16(1)
+       nextEmit := int16(0)
+       // sLimit is when to stop looking for offset/length copies. The inputMargin
+       // lets us use a fast path for emitLiteral in the main loop, while we are
+       // looking for copies.
+       sLimit := int16(len(src) - inputMargin)
+
+       // nextEmit is where in src the next emitLiteral should start from.
+       cv := load3216(src, s)
+
+       for {
+               const skipLog = 5
+               const doEvery = 2
+
+               nextS := s
+               var candidate tableEntry
+               for {
+                       nextHash := hashSL(cv)
+                       candidate = table[nextHash]
+                       nextS = s + doEvery + (s-nextEmit)>>skipLog
+                       if nextS > sLimit || nextS <= 0 {
+                               goto emitRemainder
+                       }
+
+                       now := load6416(src, nextS)
+                       table[nextHash] = tableEntry{offset: s}
+                       nextHash = hashSL(uint32(now))
+
+                       if cv == load3216(src, candidate.offset) {
+                               table[nextHash] = tableEntry{offset: nextS}
+                               break
+                       }
+
+                       // Do one right away...
+                       cv = uint32(now)
+                       s = nextS
+                       nextS++
+                       candidate = table[nextHash]
+                       now >>= 8
+                       table[nextHash] = tableEntry{offset: s}
+
+                       if cv == load3216(src, candidate.offset) {
+                               table[nextHash] = tableEntry{offset: nextS}
+                               break
+                       }
+                       cv = uint32(now)
+                       s = nextS
+               }
+
+               // A 4-byte match has been found. We'll later see if more than 4 bytes
+               // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+               // them as literal bytes.
+               for {
+                       // Invariant: we have a 4-byte match at s, and no need to emit any
+                       // literal bytes prior to s.
+
+                       // Extend the 4-byte match as long as possible.
+                       t := candidate.offset
+                       l := int16(matchLen(src[s+4:], src[t+4:]) + 4)
+
+                       // Extend backwards
+                       for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+                               s--
+                               t--
+                               l++
+                       }
+                       if nextEmit < s {
+                               emitLiteral(dst, src[nextEmit:s])
+                       }
+
+                       // Save the match found
+                       dst.AddMatchLong(int32(l), uint32(s-t-baseMatchOffset))
+                       s += l
+                       nextEmit = s
+                       if nextS >= s {
+                               s = nextS + 1
+                       }
+                       if s >= sLimit {
+                               goto emitRemainder
+                       }
+
+                       // We could immediately start working at s now, but to improve
+                       // compression we first update the hash table at s-2 and at s. If
+                       // another emitCopy is not our next move, also calculate nextHash
+                       // at s+1. At least on GOARCH=amd64, these three hash calculations
+                       // are faster as one load64 call (with some shifts) instead of
+                       // three load32 calls.
+                       x := load6416(src, s-2)
+                       o := s - 2
+                       prevHash := hashSL(uint32(x))
+                       table[prevHash] = tableEntry{offset: o}
+                       x >>= 16
+                       currHash := hashSL(uint32(x))
+                       candidate = table[currHash]
+                       table[currHash] = tableEntry{offset: o + 2}
+
+                       if uint32(x) != load3216(src, candidate.offset) {
+                               cv = uint32(x >> 8)
+                               s++
+                               break
+                       }
+               }
+       }
+
+emitRemainder:
+       if int(nextEmit) < len(src) {
+               // If nothing was added, don't encode literals.
+               if dst.n == 0 {
+                       return
+               }
+               emitLiteral(dst, src[nextEmit:])
+       }
+}
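hashSL above is a multiplicative hash: the odd constant 0x1e35a7bd mixes all four input bytes into the high bits of the product, and the shift keeps only the top slTableBits bits as the table index. A tiny standalone check of that mapping:

package main

import "fmt"

const (
    slTableBits  = 13
    slTableShift = 32 - slTableBits
)

// hashSL maps 4 input bytes to a 13-bit table index: the multiplication mixes
// all input bits into the high bits, and the shift keeps only the top 13.
func hashSL(u uint32) uint32 {
    return (u * 0x1e35a7bd) >> slTableShift
}

func main() {
    // Two similar inputs and zero; every result is below 1<<slTableBits.
    for _, v := range []uint32{0x64636261, 0x64636262, 0x00000000} {
        fmt.Printf("%#08x -> %4d\n", v, hashSL(v))
    }
}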
diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go
index 4f275ea61df63fa9915ae53e3c2ea1d4ff048489..b3df0d8941e124226f72748614c5956cf79b3bb5 100644 (file)
@@ -4,7 +4,13 @@
 
 package flate
 
-import "fmt"
+import (
+       "bytes"
+       "encoding/binary"
+       "fmt"
+       "io"
+       "math"
+)
 
 const (
        // 2 bits:   type   0 = literal  1=EOF  2=Match   3=Unused
@@ -19,7 +25,7 @@ const (
 
 // The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
 // is lengthCodes[length - MIN_MATCH_LENGTH]
-var lengthCodes = [...]uint32{
+var lengthCodes = [256]uint8{
        0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
        9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
        13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
@@ -48,7 +54,37 @@ var lengthCodes = [...]uint32{
        27, 27, 27, 27, 27, 28,
 }
 
-var offsetCodes = [...]uint32{
+// lengthCodes1 is the lengthCodes table, but with codes starting at 1 instead of 0.
+var lengthCodes1 = [256]uint8{
+       1, 2, 3, 4, 5, 6, 7, 8, 9, 9,
+       10, 10, 11, 11, 12, 12, 13, 13, 13, 13,
+       14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
+       16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
+       18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
+       19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
+       20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
+       21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+       22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+       22, 22, 22, 22, 22, 22, 23, 23, 23, 23,
+       23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+       23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
+       24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+       25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+       25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+       25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+       26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+       26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+       26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+       26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+       27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+       27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+       27, 27, 27, 27, 28, 28, 28, 28, 28, 28,
+       28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+       28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+       28, 28, 28, 28, 28, 29,
+}
+
+var offsetCodes = [256]uint32{
        0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
        8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
@@ -67,49 +103,265 @@ var offsetCodes = [...]uint32{
        15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
 }
 
+// offsetCodes14 are offsetCodes, but with 14 added.
+var offsetCodes14 = [256]uint32{
+       14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+       22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+       24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+       25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+       26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+       26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+       27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+       27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+}
+
 type token uint32
 
 type tokens struct {
-       tokens [maxStoreBlockSize + 1]token
-       n      uint16 // Must be able to contain maxStoreBlockSize
+       nLits     int
+       extraHist [32]uint16  // codes 256->maxnumlit
+       offHist   [32]uint16  // offset codes
+       litHist   [256]uint16 // codes 0->255
+       n         uint16      // Must be able to contain maxStoreBlockSize
+       tokens    [maxStoreBlockSize + 1]token
+}
+
+func (t *tokens) Reset() {
+       if t.n == 0 {
+               return
+       }
+       t.n = 0
+       t.nLits = 0
+       for i := range t.litHist[:] {
+               t.litHist[i] = 0
+       }
+       for i := range t.extraHist[:] {
+               t.extraHist[i] = 0
+       }
+       for i := range t.offHist[:] {
+               t.offHist[i] = 0
+       }
+}
+
+func (t *tokens) Fill() {
+       if t.n == 0 {
+               return
+       }
+       for i, v := range t.litHist[:] {
+               if v == 0 {
+                       t.litHist[i] = 1
+                       t.nLits++
+               }
+       }
+       for i, v := range t.extraHist[:literalCount-256] {
+               if v == 0 {
+                       t.nLits++
+                       t.extraHist[i] = 1
+               }
+       }
+       for i, v := range t.offHist[:offsetCodeCount] {
+               if v == 0 {
+                       t.offHist[i] = 1
+               }
+       }
 }
 
-// Convert a literal into a literal token.
-func literalToken(literal uint32) token { return token(literalType + literal) }
+func indexTokens(in []token) tokens {
+       var t tokens
+       t.indexTokens(in)
+       return t
+}
+
+func (t *tokens) indexTokens(in []token) {
+       t.Reset()
+       for _, tok := range in {
+               if tok < matchType {
+                       t.tokens[t.n] = tok
+                       t.litHist[tok]++
+                       t.n++
+                       continue
+               }
+               t.AddMatch(uint32(tok.length()), tok.offset())
+       }
+}
+
+// emitLiteral writes a literal chunk into the tokens and updates the literal histogram.
+func emitLiteral(dst *tokens, lit []byte) {
+       ol := int(dst.n)
+       for i, v := range lit {
+               dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
+               dst.litHist[v]++
+       }
+       dst.n += uint16(len(lit))
+       dst.nLits += len(lit)
+}
 
-// Convert a < xlength, xoffset > pair into a match token.
-func matchToken(xlength uint32, xoffset uint32) token {
-       return token(matchType + xlength<<lengthShift + xoffset)
+func (t *tokens) AddLiteral(lit byte) {
+       t.tokens[t.n] = token(lit)
+       t.litHist[lit]++
+       t.n++
+       t.nLits++
 }
 
-func matchTokend(xlength uint32, xoffset uint32) token {
-       if xlength > maxMatchLength || xoffset > maxMatchOffset {
-               panic(fmt.Sprintf("Invalid match: len: %d, offset: %d\n", xlength, xoffset))
-               return token(matchType)
+// EstimatedBits returns a minimum size estimate for the block, assuming an
+// *optimal* compression of its contents.
+func (t *tokens) EstimatedBits() int {
+       shannon := float64(0)
+       bits := int(0)
+       nMatches := 0
+       if t.nLits > 0 {
+               invTotal := 1.0 / float64(t.nLits)
+               for _, v := range t.litHist[:] {
+                       if v > 0 {
+                               n := float64(v)
+                               shannon += math.Ceil(-math.Log2(n*invTotal) * n)
+                       }
+               }
+               // Just add 15 for EOB
+               shannon += 15
+               for _, v := range t.extraHist[1 : literalCount-256] {
+                       if v > 0 {
+                               n := float64(v)
+                               shannon += math.Ceil(-math.Log2(n*invTotal) * n)
+                               bits += int(lengthExtraBits[v&31]) * int(v)
+                               nMatches += int(v)
+                       }
+               }
        }
-       return token(matchType + xlength<<lengthShift + xoffset)
+       if nMatches > 0 {
+               invTotal := 1.0 / float64(nMatches)
+               for _, v := range t.offHist[:offsetCodeCount] {
+                       if v > 0 {
+                               n := float64(v)
+                               shannon += math.Ceil(-math.Log2(n*invTotal) * n)
+                               bits += int(offsetExtraBits[v&31]) * int(n)
+                       }
+               }
+       }
+
+       return int(shannon) + bits
+}
+
+// AddMatch adds a match to the tokens.
+// This function is very sensitive to inlining and is right on the border of
+// the compiler's inlining budget.
+func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
+       if debugDecode {
+               if xlength >= maxMatchLength+baseMatchLength {
+                       panic(fmt.Errorf("invalid length: %v", xlength))
+               }
+               if xoffset >= maxMatchOffset+baseMatchOffset {
+                       panic(fmt.Errorf("invalid offset: %v", xoffset))
+               }
+       }
+       t.nLits++
+       lengthCode := lengthCodes1[uint8(xlength)] & 31
+       t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
+       t.extraHist[lengthCode]++
+       t.offHist[offsetCode(xoffset)&31]++
+       t.n++
+}
+
+// AddMatchLong adds a match to the tokens, potentially longer than max match length.
+// Length should NOT have the base subtracted, only offset should.
+func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
+       if debugDecode {
+               if xoffset >= maxMatchOffset+baseMatchOffset {
+                       panic(fmt.Errorf("invalid offset: %v", xoffset))
+               }
+       }
+       oc := offsetCode(xoffset) & 31
+       for xlength > 0 {
+               xl := xlength
+               if xl > 258 {
+                       // We need to have at least baseMatchLength left over for the next iteration.
+                       xl = 258 - baseMatchLength
+               }
+               xlength -= xl
+               xl -= 3
+               t.nLits++
+               lengthCode := lengthCodes1[uint8(xl)] & 31
+               t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
+               t.extraHist[lengthCode]++
+               t.offHist[oc]++
+               t.n++
+       }
+}
+
+func (t *tokens) AddEOB() {
+       t.tokens[t.n] = token(endBlockMarker)
+       t.extraHist[0]++
+       t.n++
+}
+
+func (t *tokens) Slice() []token {
+       return t.tokens[:t.n]
+}
+
+// VarInt returns the tokens as varint encoded bytes.
+func (t *tokens) VarInt() []byte {
+       var b = make([]byte, binary.MaxVarintLen32*int(t.n))
+       var off int
+       for _, v := range t.tokens[:t.n] {
+               off += binary.PutUvarint(b[off:], uint64(v))
+       }
+       return b[:off]
+}
+
+// FromVarInt restores t to the varint encoded tokens provided.
+// Any data in t is removed.
+func (t *tokens) FromVarInt(b []byte) error {
+       var buf = bytes.NewReader(b)
+       var toks []token
+       for {
+               r, err := binary.ReadUvarint(buf)
+               if err == io.EOF {
+                       break
+               }
+               if err != nil {
+                       return err
+               }
+               toks = append(toks, token(r))
+       }
+       t.indexTokens(toks)
+       return nil
 }
 
 // Returns the type of a token
 func (t token) typ() uint32 { return uint32(t) & typeMask }
 
 // Returns the literal of a literal token
-func (t token) literal() uint32 { return uint32(t - literalType) }
+func (t token) literal() uint8 { return uint8(t) }
 
 // Returns the extra offset of a match token
 func (t token) offset() uint32 { return uint32(t) & offsetMask }
 
-func (t token) length() uint32 { return uint32((t - matchType) >> lengthShift) }
+func (t token) length() uint8 { return uint8(t >> lengthShift) }
 
-func lengthCode(len uint32) uint32 { return lengthCodes[len] }
+// The code is never more than 8 bits, but is returned as uint32 for convenience.
+func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) }
 
 // Returns the offset code corresponding to a specific offset
 func offsetCode(off uint32) uint32 {
+       if false {
+               if off < uint32(len(offsetCodes)) {
+                       return offsetCodes[off&255]
+               } else if off>>7 < uint32(len(offsetCodes)) {
+                       return offsetCodes[(off>>7)&255] + 14
+               } else {
+                       return offsetCodes[(off>>14)&255] + 28
+               }
+       }
        if off < uint32(len(offsetCodes)) {
-               return offsetCodes[off]
-       } else if off>>7 < uint32(len(offsetCodes)) {
-               return offsetCodes[off>>7] + 14
-       } else {
-               return offsetCodes[off>>14] + 28
+               return offsetCodes[uint8(off)]
        }
+       return offsetCodes14[uint8(off>>7)]
 }
diff --git a/vendor/github.com/klauspost/compress/gzip/gunzip.go b/vendor/github.com/klauspost/compress/gzip/gunzip.go
index e73fab3f0fdd531367eebeb221014ae82579890d..568b5d4fb8b984e7c56dd80feb04d6d54828da87 100644 (file)
@@ -10,11 +10,11 @@ import (
        "bufio"
        "encoding/binary"
        "errors"
+       "hash/crc32"
        "io"
        "time"
 
        "github.com/klauspost/compress/flate"
-       "github.com/klauspost/crc32"
 )
 
 const (
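The vendored github.com/klauspost/crc32 dependency is replaced by the standard library's hash/crc32, which provides the same Update and IEEETable identifiers used at the call sites, so the digest handling is unchanged. For reference, the incremental pattern the Writer relies on:

```go
package main

import (
	"fmt"
	"hash/crc32"
)

func main() {
	// Incremental CRC-32 (IEEE): the running digest is threaded through
	// successive Update calls, one per Write, as the gzip Writer does.
	var digest uint32
	digest = crc32.Update(digest, crc32.IEEETable, []byte("hello, "))
	digest = crc32.Update(digest, crc32.IEEETable, []byte("gzip"))
	fmt.Printf("crc32: %08x\n", digest)
}
```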
diff --git a/vendor/github.com/klauspost/compress/gzip/gzip.go b/vendor/github.com/klauspost/compress/gzip/gzip.go
index a0f3ed0fcf0e35e85d51404e0b879c17f63db8d8..ed0cc148f8c7762952ec590e682d4e39b5003d74 100644 (file)
@@ -7,10 +7,10 @@ package gzip
 import (
        "errors"
        "fmt"
+       "hash/crc32"
        "io"
 
        "github.com/klauspost/compress/flate"
-       "github.com/klauspost/crc32"
 )
 
 // These constants are copied from the flate package, so that code that imports
@@ -22,6 +22,13 @@ const (
        DefaultCompression  = flate.DefaultCompression
        ConstantCompression = flate.ConstantCompression
        HuffmanOnly         = flate.HuffmanOnly
+
+       // StatelessCompression will do compression without maintaining any state
+       // between Write calls: nothing is kept in memory from one call to the next,
+       // but compression ratio and speed will be suboptimal.
+       // Because of this, the size of individual Write calls will affect the output size.
+       StatelessCompression = -3
 )
 
 // A Writer is an io.WriteCloser.
@@ -59,7 +66,7 @@ func NewWriter(w io.Writer) *Writer {
 // integer value between BestSpeed and BestCompression inclusive. The error
 // returned will be nil if the level is valid.
 func NewWriterLevel(w io.Writer, level int) (*Writer, error) {
-       if level < HuffmanOnly || level > BestCompression {
+       if level < StatelessCompression || level > BestCompression {
                return nil, fmt.Errorf("gzip: invalid compression level: %d", level)
        }
        z := new(Writer)
@@ -69,9 +76,12 @@ func NewWriterLevel(w io.Writer, level int) (*Writer, error) {
 
 func (z *Writer) init(w io.Writer, level int) {
        compressor := z.compressor
-       if compressor != nil {
-               compressor.Reset(w)
+       if level != StatelessCompression {
+               if compressor != nil {
+                       compressor.Reset(w)
+               }
        }
+
        *z = Writer{
                Header: Header{
                        OS: 255, // unknown
@@ -189,12 +199,16 @@ func (z *Writer) Write(p []byte) (int, error) {
                                return n, z.err
                        }
                }
-               if z.compressor == nil {
+
+               if z.compressor == nil && z.level != StatelessCompression {
                        z.compressor, _ = flate.NewWriter(z.w, z.level)
                }
        }
        z.size += uint32(len(p))
        z.digest = crc32.Update(z.digest, crc32.IEEETable, p)
+       if z.level == StatelessCompression {
+               return len(p), flate.StatelessDeflate(z.w, p, false)
+       }
        n, z.err = z.compressor.Write(p)
        return n, z.err
 }
@@ -211,7 +225,7 @@ func (z *Writer) Flush() error {
        if z.err != nil {
                return z.err
        }
-       if z.closed {
+       if z.closed || z.level == StatelessCompression {
                return nil
        }
        if !z.wroteHeader {
@@ -240,7 +254,11 @@ func (z *Writer) Close() error {
                        return z.err
                }
        }
-       z.err = z.compressor.Close()
+       if z.level == StatelessCompression {
+               z.err = flate.StatelessDeflate(z.w, nil, true)
+       } else {
+               z.err = z.compressor.Close()
+       }
        if z.err != nil {
                return z.err
        }
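With the new StatelessCompression level the Writer never allocates a flate compressor: each Write is passed straight to flate.StatelessDeflate, and Close finishes the stream with flate.StatelessDeflate(w, nil, true). A minimal usage sketch (assuming the package is imported by its upstream path, github.com/klauspost/compress/gzip):

```go
package main

import (
	"bytes"
	"log"

	"github.com/klauspost/compress/gzip"
)

func main() {
	var buf bytes.Buffer
	// Each Write is compressed independently; larger writes compress better.
	zw, err := gzip.NewWriterLevel(&buf, gzip.StatelessCompression)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := zw.Write([]byte("stateless gzip keeps no state between writes\n")); err != nil {
		log.Fatal(err)
	}
	if err := zw.Close(); err != nil {
		log.Fatal(err)
	}
	log.Printf("compressed %d bytes", buf.Len())
}
```

Since nothing is carried over between calls, batching data into fewer, larger Write calls yields a better ratio, as the constant's comment notes.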
diff --git a/vendor/github.com/klauspost/cpuid/.gitignore b/vendor/github.com/klauspost/cpuid/.gitignore
deleted file mode 100644 (file)
index daf913b..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-# Compiled Object files, Static and Dynamic libs (Shared Objects)
-*.o
-*.a
-*.so
-
-# Folders
-_obj
-_test
-
-# Architecture specific extensions/prefixes
-*.[568vq]
-[568vq].out
-
-*.cgo1.go
-*.cgo2.c
-_cgo_defun.c
-_cgo_gotypes.go
-_cgo_export.*
-
-_testmain.go
-
-*.exe
-*.test
-*.prof
diff --git a/vendor/github.com/klauspost/cpuid/.travis.yml b/vendor/github.com/klauspost/cpuid/.travis.yml
deleted file mode 100644 (file)
index bde823d..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-language: go
-
-go:
-  - 1.3
-  - 1.4
-  - 1.5
-  - 1.6
-  - tip
diff --git a/vendor/github.com/klauspost/cpuid/LICENSE b/vendor/github.com/klauspost/cpuid/LICENSE
deleted file mode 100644 (file)
index 5cec7ee..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-The MIT License (MIT)
-
-Copyright (c) 2015 Klaus Post
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
diff --git a/vendor/github.com/klauspost/cpuid/README.md b/vendor/github.com/klauspost/cpuid/README.md
deleted file mode 100644 (file)
index b2b6bee..0000000
+++ /dev/null
@@ -1,145 +0,0 @@
-# cpuid
-Package cpuid provides information about the CPU running the current program.
-
-CPU features are detected on startup, and kept for fast access through the life of the application.
-Currently x86 / x64 (AMD64) is supported, and no external C (cgo) code is used, which should make the library very easy to use.
-
-You can access the CPU information by accessing the shared CPU variable of the cpuid library.
-
-Package home: https://github.com/klauspost/cpuid
-
-[![GoDoc][1]][2] [![Build Status][3]][4]
-
-[1]: https://godoc.org/github.com/klauspost/cpuid?status.svg
-[2]: https://godoc.org/github.com/klauspost/cpuid
-[3]: https://travis-ci.org/klauspost/cpuid.svg
-[4]: https://travis-ci.org/klauspost/cpuid
-
-# features
-## CPU Instructions
-*  **CMOV** (i686 CMOV)
-*  **NX** (NX (No-Execute) bit)
-*  **AMD3DNOW** (AMD 3DNOW)
-*  **AMD3DNOWEXT** (AMD 3DNowExt)
-*  **MMX** (standard MMX)
-*  **MMXEXT** (SSE integer functions or AMD MMX ext)
-*  **SSE** (SSE functions)
-*  **SSE2** (P4 SSE functions)
-*  **SSE3** (Prescott SSE3 functions)
-*  **SSSE3** (Conroe SSSE3 functions)
-*  **SSE4** (Penryn SSE4.1 functions)
-*  **SSE4A** (AMD Barcelona microarchitecture SSE4a instructions)
-*  **SSE42** (Nehalem SSE4.2 functions)
-*  **AVX** (AVX functions)
-*  **AVX2** (AVX2 functions)
-*  **FMA3** (Intel FMA 3)
-*  **FMA4** (Bulldozer FMA4 functions)
-*  **XOP** (Bulldozer XOP functions)
-*  **F16C** (Half-precision floating-point conversion)
-*  **BMI1** (Bit Manipulation Instruction Set 1)
-*  **BMI2** (Bit Manipulation Instruction Set 2)
-*  **TBM** (AMD Trailing Bit Manipulation)
-*  **LZCNT** (LZCNT instruction)
-*  **POPCNT** (POPCNT instruction)
-*  **AESNI** (Advanced Encryption Standard New Instructions)
-*  **CLMUL** (Carry-less Multiplication)
-*  **HTT** (Hyperthreading (enabled))
-*  **HLE** (Hardware Lock Elision)
-*  **RTM** (Restricted Transactional Memory)
-*  **RDRAND** (RDRAND instruction is available)
-*  **RDSEED** (RDSEED instruction is available)
-*  **ADX** (Intel ADX (Multi-Precision Add-Carry Instruction Extensions))
-*  **SHA** (Intel SHA Extensions)
-*  **AVX512F** (AVX-512 Foundation)
-*  **AVX512DQ** (AVX-512 Doubleword and Quadword Instructions)
-*  **AVX512IFMA** (AVX-512 Integer Fused Multiply-Add Instructions)
-*  **AVX512PF** (AVX-512 Prefetch Instructions)
-*  **AVX512ER** (AVX-512 Exponential and Reciprocal Instructions)
-*  **AVX512CD** (AVX-512 Conflict Detection Instructions)
-*  **AVX512BW** (AVX-512 Byte and Word Instructions)
-*  **AVX512VL** (AVX-512 Vector Length Extensions)
-*  **AVX512VBMI** (AVX-512 Vector Bit Manipulation Instructions)
-*  **MPX** (Intel MPX (Memory Protection Extensions))
-*  **ERMS** (Enhanced REP MOVSB/STOSB)
-*  **RDTSCP** (RDTSCP Instruction)
-*  **CX16** (CMPXCHG16B Instruction)
-*  **SGX** (Software Guard Extensions, with activation details)
-
-## Performance
-*  **RDTSCP()** Returns current cycle count. Can be used for benchmarking.
-*  **SSE2SLOW** (SSE2 is supported, but usually not faster)
-*  **SSE3SLOW** (SSE3 is supported, but usually not faster)
-*  **ATOM** (Atom processor, some SSSE3 instructions are slower)
-*  **Cache line** (Probable size of a cache line).
-*  **L1, L2, L3 Cache size** on newer Intel/AMD CPUs.
-
-## Cpu Vendor/VM
-* **Intel**
-* **AMD**
-* **VIA**
-* **Transmeta**
-* **NSC**
-* **KVM**  (Kernel-based Virtual Machine)
-* **MSVM** (Microsoft Hyper-V or Windows Virtual PC)
-* **VMware**
-* **XenHVM**
-
-# installing
-
-```go get github.com/klauspost/cpuid```
-
-# example
-
-```Go
-package main
-
-import (
-       "fmt"
-       "github.com/klauspost/cpuid"
-)
-
-func main() {
-       // Print basic CPU information:
-       fmt.Println("Name:", cpuid.CPU.BrandName)
-       fmt.Println("PhysicalCores:", cpuid.CPU.PhysicalCores)
-       fmt.Println("ThreadsPerCore:", cpuid.CPU.ThreadsPerCore)
-       fmt.Println("LogicalCores:", cpuid.CPU.LogicalCores)
-       fmt.Println("Family", cpuid.CPU.Family, "Model:", cpuid.CPU.Model)
-       fmt.Println("Features:", cpuid.CPU.Features)
-       fmt.Println("Cacheline bytes:", cpuid.CPU.CacheLine)
-       fmt.Println("L1 Data Cache:", cpuid.CPU.Cache.L1D, "bytes")
-       fmt.Println("L1 Instruction Cache:", cpuid.CPU.Cache.L1D, "bytes")
-       fmt.Println("L2 Cache:", cpuid.CPU.Cache.L2, "bytes")
-       fmt.Println("L3 Cache:", cpuid.CPU.Cache.L3, "bytes")
-
-       // Test if we have a specific feature:
-       if cpuid.CPU.SSE() {
-               fmt.Println("We have Streaming SIMD Extensions")
-       }
-}
-```
-
-Sample output:
-```
->go run main.go
-Name: Intel(R) Core(TM) i5-2540M CPU @ 2.60GHz
-PhysicalCores: 2
-ThreadsPerCore: 2
-LogicalCores: 4
-Family 6 Model: 42
-Features: CMOV,MMX,MMXEXT,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,AVX,AESNI,CLMUL
-Cacheline bytes: 64
-We have Streaming SIMD Extensions
-```
-
-# private package
-
-In the "private" folder you can find an autogenerated version of the library you can include in your own packages.
-
-For this purpose all exports are removed, and functions and constants are lowercased.
-
-This is not a recommended way of using the library, but provided for convenience, if it is difficult for you to use external packages.
-
-# license
-
-This code is published under an MIT license. See LICENSE file for more information.
diff --git a/vendor/github.com/klauspost/cpuid/cpuid.go b/vendor/github.com/klauspost/cpuid/cpuid.go
deleted file mode 100644 (file)
index 9230ca5..0000000
+++ /dev/null
@@ -1,1022 +0,0 @@
-// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
-
-// Package cpuid provides information about the CPU running the current program.
-//
-// CPU features are detected on startup, and kept for fast access through the life of the application.
-// Currently x86 / x64 (AMD64) is supported.
-//
-// You can access the CPU information by accessing the shared CPU variable of the cpuid library.
-//
-// Package home: https://github.com/klauspost/cpuid
-package cpuid
-
-import "strings"
-
-// Vendor is a representation of a CPU vendor.
-type Vendor int
-
-const (
-       Other Vendor = iota
-       Intel
-       AMD
-       VIA
-       Transmeta
-       NSC
-       KVM  // Kernel-based Virtual Machine
-       MSVM // Microsoft Hyper-V or Windows Virtual PC
-       VMware
-       XenHVM
-)
-
-const (
-       CMOV        = 1 << iota // i686 CMOV
-       NX                      // NX (No-Execute) bit
-       AMD3DNOW                // AMD 3DNOW
-       AMD3DNOWEXT             // AMD 3DNowExt
-       MMX                     // standard MMX
-       MMXEXT                  // SSE integer functions or AMD MMX ext
-       SSE                     // SSE functions
-       SSE2                    // P4 SSE functions
-       SSE3                    // Prescott SSE3 functions
-       SSSE3                   // Conroe SSSE3 functions
-       SSE4                    // Penryn SSE4.1 functions
-       SSE4A                   // AMD Barcelona microarchitecture SSE4a instructions
-       SSE42                   // Nehalem SSE4.2 functions
-       AVX                     // AVX functions
-       AVX2                    // AVX2 functions
-       FMA3                    // Intel FMA 3
-       FMA4                    // Bulldozer FMA4 functions
-       XOP                     // Bulldozer XOP functions
-       F16C                    // Half-precision floating-point conversion
-       BMI1                    // Bit Manipulation Instruction Set 1
-       BMI2                    // Bit Manipulation Instruction Set 2
-       TBM                     // AMD Trailing Bit Manipulation
-       LZCNT                   // LZCNT instruction
-       POPCNT                  // POPCNT instruction
-       AESNI                   // Advanced Encryption Standard New Instructions
-       CLMUL                   // Carry-less Multiplication
-       HTT                     // Hyperthreading (enabled)
-       HLE                     // Hardware Lock Elision
-       RTM                     // Restricted Transactional Memory
-       RDRAND                  // RDRAND instruction is available
-       RDSEED                  // RDSEED instruction is available
-       ADX                     // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
-       SHA                     // Intel SHA Extensions
-       AVX512F                 // AVX-512 Foundation
-       AVX512DQ                // AVX-512 Doubleword and Quadword Instructions
-       AVX512IFMA              // AVX-512 Integer Fused Multiply-Add Instructions
-       AVX512PF                // AVX-512 Prefetch Instructions
-       AVX512ER                // AVX-512 Exponential and Reciprocal Instructions
-       AVX512CD                // AVX-512 Conflict Detection Instructions
-       AVX512BW                // AVX-512 Byte and Word Instructions
-       AVX512VL                // AVX-512 Vector Length Extensions
-       AVX512VBMI              // AVX-512 Vector Bit Manipulation Instructions
-       MPX                     // Intel MPX (Memory Protection Extensions)
-       ERMS                    // Enhanced REP MOVSB/STOSB
-       RDTSCP                  // RDTSCP Instruction
-       CX16                    // CMPXCHG16B Instruction
-       SGX                     // Software Guard Extensions
-
-       // Performance indicators
-       SSE2SLOW // SSE2 is supported, but usually not faster
-       SSE3SLOW // SSE3 is supported, but usually not faster
-       ATOM     // Atom processor, some SSSE3 instructions are slower
-)
-
-var flagNames = map[Flags]string{
-       CMOV:        "CMOV",        // i686 CMOV
-       NX:          "NX",          // NX (No-Execute) bit
-       AMD3DNOW:    "AMD3DNOW",    // AMD 3DNOW
-       AMD3DNOWEXT: "AMD3DNOWEXT", // AMD 3DNowExt
-       MMX:         "MMX",         // Standard MMX
-       MMXEXT:      "MMXEXT",      // SSE integer functions or AMD MMX ext
-       SSE:         "SSE",         // SSE functions
-       SSE2:        "SSE2",        // P4 SSE2 functions
-       SSE3:        "SSE3",        // Prescott SSE3 functions
-       SSSE3:       "SSSE3",       // Conroe SSSE3 functions
-       SSE4:        "SSE4.1",      // Penryn SSE4.1 functions
-       SSE4A:       "SSE4A",       // AMD Barcelona microarchitecture SSE4a instructions
-       SSE42:       "SSE4.2",      // Nehalem SSE4.2 functions
-       AVX:         "AVX",         // AVX functions
-       AVX2:        "AVX2",        // AVX functions
-       FMA3:        "FMA3",        // Intel FMA 3
-       FMA4:        "FMA4",        // Bulldozer FMA4 functions
-       XOP:         "XOP",         // Bulldozer XOP functions
-       F16C:        "F16C",        // Half-precision floating-point conversion
-       BMI1:        "BMI1",        // Bit Manipulation Instruction Set 1
-       BMI2:        "BMI2",        // Bit Manipulation Instruction Set 2
-       TBM:         "TBM",         // AMD Trailing Bit Manipulation
-       LZCNT:       "LZCNT",       // LZCNT instruction
-       POPCNT:      "POPCNT",      // POPCNT instruction
-       AESNI:       "AESNI",       // Advanced Encryption Standard New Instructions
-       CLMUL:       "CLMUL",       // Carry-less Multiplication
-       HTT:         "HTT",         // Hyperthreading (enabled)
-       HLE:         "HLE",         // Hardware Lock Elision
-       RTM:         "RTM",         // Restricted Transactional Memory
-       RDRAND:      "RDRAND",      // RDRAND instruction is available
-       RDSEED:      "RDSEED",      // RDSEED instruction is available
-       ADX:         "ADX",         // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
-       SHA:         "SHA",         // Intel SHA Extensions
-       AVX512F:     "AVX512F",     // AVX-512 Foundation
-       AVX512DQ:    "AVX512DQ",    // AVX-512 Doubleword and Quadword Instructions
-       AVX512IFMA:  "AVX512IFMA",  // AVX-512 Integer Fused Multiply-Add Instructions
-       AVX512PF:    "AVX512PF",    // AVX-512 Prefetch Instructions
-       AVX512ER:    "AVX512ER",    // AVX-512 Exponential and Reciprocal Instructions
-       AVX512CD:    "AVX512CD",    // AVX-512 Conflict Detection Instructions
-       AVX512BW:    "AVX512BW",    // AVX-512 Byte and Word Instructions
-       AVX512VL:    "AVX512VL",    // AVX-512 Vector Length Extensions
-       AVX512VBMI:  "AVX512VBMI",  // AVX-512 Vector Bit Manipulation Instructions
-       MPX:         "MPX",         // Intel MPX (Memory Protection Extensions)
-       ERMS:        "ERMS",        // Enhanced REP MOVSB/STOSB
-       RDTSCP:      "RDTSCP",      // RDTSCP Instruction
-       CX16:        "CX16",        // CMPXCHG16B Instruction
-       SGX:         "SGX",         // Software Guard Extensions
-
-       // Performance indicators
-       SSE2SLOW: "SSE2SLOW", // SSE2 supported, but usually not faster
-       SSE3SLOW: "SSE3SLOW", // SSE3 supported, but usually not faster
-       ATOM:     "ATOM",     // Atom processor, some SSSE3 instructions are slower
-
-}
-
-// CPUInfo contains information about the detected system CPU.
-type CPUInfo struct {
-       BrandName      string // Brand name reported by the CPU
-       VendorID       Vendor // Comparable CPU vendor ID
-       Features       Flags  // Features of the CPU
-       PhysicalCores  int    // Number of physical processor cores in your CPU. Will be 0 if undetectable.
-       ThreadsPerCore int    // Number of threads per physical core. Will be 1 if undetectable.
-       LogicalCores   int    // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable.
-       Family         int    // CPU family number
-       Model          int    // CPU model number
-       CacheLine      int    // Cache line size in bytes. Will be 0 if undetectable.
-       Cache          struct {
-               L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected
-               L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected
-               L2  int // L2 Cache (per core or shared). Will be -1 if undetected
-               L3  int // L3 Instruction Cache (per core or shared). Will be -1 if undetected
-       }
-       SGX       SGXSupport
-       maxFunc   uint32
-       maxExFunc uint32
-}
-
-var cpuid func(op uint32) (eax, ebx, ecx, edx uint32)
-var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32)
-var xgetbv func(index uint32) (eax, edx uint32)
-var rdtscpAsm func() (eax, ebx, ecx, edx uint32)
-
-// CPU contains information about the CPU as detected on startup,
-// or when Detect last was called.
-//
-// Use this as the primary entry point to you data,
-// this way queries are
-var CPU CPUInfo
-
-func init() {
-       initCPU()
-       Detect()
-}
-
-// Detect will re-detect current CPU info.
-// This will replace the content of the exported CPU variable.
-//
-// Unless you expect the CPU to change while you are running your program
-// you should not need to call this function.
-// If you call this, you must ensure that no other goroutine is accessing the
-// exported CPU variable.
-func Detect() {
-       CPU.maxFunc = maxFunctionID()
-       CPU.maxExFunc = maxExtendedFunction()
-       CPU.BrandName = brandName()
-       CPU.CacheLine = cacheLine()
-       CPU.Family, CPU.Model = familyModel()
-       CPU.Features = support()
-       CPU.SGX = sgx(CPU.Features&SGX != 0)
-       CPU.ThreadsPerCore = threadsPerCore()
-       CPU.LogicalCores = logicalCores()
-       CPU.PhysicalCores = physicalCores()
-       CPU.VendorID = vendorID()
-       CPU.cacheSize()
-}
-
-// Generated here: http://play.golang.org/p/BxFH2Gdc0G
-
-// Cmov indicates support of CMOV instructions
-func (c CPUInfo) Cmov() bool {
-       return c.Features&CMOV != 0
-}
-
-// Amd3dnow indicates support of AMD 3DNOW! instructions
-func (c CPUInfo) Amd3dnow() bool {
-       return c.Features&AMD3DNOW != 0
-}
-
-// Amd3dnowExt indicates support of AMD 3DNOW! Extended instructions
-func (c CPUInfo) Amd3dnowExt() bool {
-       return c.Features&AMD3DNOWEXT != 0
-}
-
-// MMX indicates support of MMX instructions
-func (c CPUInfo) MMX() bool {
-       return c.Features&MMX != 0
-}
-
-// MMXExt indicates support of MMXEXT instructions
-// (SSE integer functions or AMD MMX ext)
-func (c CPUInfo) MMXExt() bool {
-       return c.Features&MMXEXT != 0
-}
-
-// SSE indicates support of SSE instructions
-func (c CPUInfo) SSE() bool {
-       return c.Features&SSE != 0
-}
-
-// SSE2 indicates support of SSE 2 instructions
-func (c CPUInfo) SSE2() bool {
-       return c.Features&SSE2 != 0
-}
-
-// SSE3 indicates support of SSE 3 instructions
-func (c CPUInfo) SSE3() bool {
-       return c.Features&SSE3 != 0
-}
-
-// SSSE3 indicates support of SSSE 3 instructions
-func (c CPUInfo) SSSE3() bool {
-       return c.Features&SSSE3 != 0
-}
-
-// SSE4 indicates support of SSE 4 (also called SSE 4.1) instructions
-func (c CPUInfo) SSE4() bool {
-       return c.Features&SSE4 != 0
-}
-
-// SSE42 indicates support of SSE4.2 instructions
-func (c CPUInfo) SSE42() bool {
-       return c.Features&SSE42 != 0
-}
-
-// AVX indicates support of AVX instructions
-// and operating system support of AVX instructions
-func (c CPUInfo) AVX() bool {
-       return c.Features&AVX != 0
-}
-
-// AVX2 indicates support of AVX2 instructions
-func (c CPUInfo) AVX2() bool {
-       return c.Features&AVX2 != 0
-}
-
-// FMA3 indicates support of FMA3 instructions
-func (c CPUInfo) FMA3() bool {
-       return c.Features&FMA3 != 0
-}
-
-// FMA4 indicates support of FMA4 instructions
-func (c CPUInfo) FMA4() bool {
-       return c.Features&FMA4 != 0
-}
-
-// XOP indicates support of XOP instructions
-func (c CPUInfo) XOP() bool {
-       return c.Features&XOP != 0
-}
-
-// F16C indicates support of F16C instructions
-func (c CPUInfo) F16C() bool {
-       return c.Features&F16C != 0
-}
-
-// BMI1 indicates support of BMI1 instructions
-func (c CPUInfo) BMI1() bool {
-       return c.Features&BMI1 != 0
-}
-
-// BMI2 indicates support of BMI2 instructions
-func (c CPUInfo) BMI2() bool {
-       return c.Features&BMI2 != 0
-}
-
-// TBM indicates support of TBM instructions
-// (AMD Trailing Bit Manipulation)
-func (c CPUInfo) TBM() bool {
-       return c.Features&TBM != 0
-}
-
-// Lzcnt indicates support of LZCNT instruction
-func (c CPUInfo) Lzcnt() bool {
-       return c.Features&LZCNT != 0
-}
-
-// Popcnt indicates support of POPCNT instruction
-func (c CPUInfo) Popcnt() bool {
-       return c.Features&POPCNT != 0
-}
-
-// HTT indicates the processor has Hyperthreading enabled
-func (c CPUInfo) HTT() bool {
-       return c.Features&HTT != 0
-}
-
-// SSE2Slow indicates that SSE2 may be slow on this processor
-func (c CPUInfo) SSE2Slow() bool {
-       return c.Features&SSE2SLOW != 0
-}
-
-// SSE3Slow indicates that SSE3 may be slow on this processor
-func (c CPUInfo) SSE3Slow() bool {
-       return c.Features&SSE3SLOW != 0
-}
-
-// AesNi indicates support of AES-NI instructions
-// (Advanced Encryption Standard New Instructions)
-func (c CPUInfo) AesNi() bool {
-       return c.Features&AESNI != 0
-}
-
-// Clmul indicates support of CLMUL instructions
-// (Carry-less Multiplication)
-func (c CPUInfo) Clmul() bool {
-       return c.Features&CLMUL != 0
-}
-
-// NX indicates support of NX (No-Execute) bit
-func (c CPUInfo) NX() bool {
-       return c.Features&NX != 0
-}
-
-// SSE4A indicates support of AMD Barcelona microarchitecture SSE4a instructions
-func (c CPUInfo) SSE4A() bool {
-       return c.Features&SSE4A != 0
-}
-
-// HLE indicates support of Hardware Lock Elision
-func (c CPUInfo) HLE() bool {
-       return c.Features&HLE != 0
-}
-
-// RTM indicates support of Restricted Transactional Memory
-func (c CPUInfo) RTM() bool {
-       return c.Features&RTM != 0
-}
-
-// Rdrand indicates support of RDRAND instruction is available
-func (c CPUInfo) Rdrand() bool {
-       return c.Features&RDRAND != 0
-}
-
-// Rdseed indicates support of RDSEED instruction is available
-func (c CPUInfo) Rdseed() bool {
-       return c.Features&RDSEED != 0
-}
-
-// ADX indicates support of Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
-func (c CPUInfo) ADX() bool {
-       return c.Features&ADX != 0
-}
-
-// SHA indicates support of Intel SHA Extensions
-func (c CPUInfo) SHA() bool {
-       return c.Features&SHA != 0
-}
-
-// AVX512F indicates support of AVX-512 Foundation
-func (c CPUInfo) AVX512F() bool {
-       return c.Features&AVX512F != 0
-}
-
-// AVX512DQ indicates support of AVX-512 Doubleword and Quadword Instructions
-func (c CPUInfo) AVX512DQ() bool {
-       return c.Features&AVX512DQ != 0
-}
-
-// AVX512IFMA indicates support of AVX-512 Integer Fused Multiply-Add Instructions
-func (c CPUInfo) AVX512IFMA() bool {
-       return c.Features&AVX512IFMA != 0
-}
-
-// AVX512PF indicates support of AVX-512 Prefetch Instructions
-func (c CPUInfo) AVX512PF() bool {
-       return c.Features&AVX512PF != 0
-}
-
-// AVX512ER indicates support of AVX-512 Exponential and Reciprocal Instructions
-func (c CPUInfo) AVX512ER() bool {
-       return c.Features&AVX512ER != 0
-}
-
-// AVX512CD indicates support of AVX-512 Conflict Detection Instructions
-func (c CPUInfo) AVX512CD() bool {
-       return c.Features&AVX512CD != 0
-}
-
-// AVX512BW indicates support of AVX-512 Byte and Word Instructions
-func (c CPUInfo) AVX512BW() bool {
-       return c.Features&AVX512BW != 0
-}
-
-// AVX512VL indicates support of AVX-512 Vector Length Extensions
-func (c CPUInfo) AVX512VL() bool {
-       return c.Features&AVX512VL != 0
-}
-
-// AVX512VBMI indicates support of AVX-512 Vector Bit Manipulation Instructions
-func (c CPUInfo) AVX512VBMI() bool {
-       return c.Features&AVX512VBMI != 0
-}
-
-// MPX indicates support of Intel MPX (Memory Protection Extensions)
-func (c CPUInfo) MPX() bool {
-       return c.Features&MPX != 0
-}
-
-// ERMS indicates support of Enhanced REP MOVSB/STOSB
-func (c CPUInfo) ERMS() bool {
-       return c.Features&ERMS != 0
-}
-
-func (c CPUInfo) RDTSCP() bool {
-       return c.Features&RDTSCP != 0
-}
-
-func (c CPUInfo) CX16() bool {
-       return c.Features&CX16 != 0
-}
-
-// Atom indicates an Atom processor
-func (c CPUInfo) Atom() bool {
-       return c.Features&ATOM != 0
-}
-
-// Intel returns true if vendor is recognized as Intel
-func (c CPUInfo) Intel() bool {
-       return c.VendorID == Intel
-}
-
-// AMD returns true if vendor is recognized as AMD
-func (c CPUInfo) AMD() bool {
-       return c.VendorID == AMD
-}
-
-// Transmeta returns true if vendor is recognized as Transmeta
-func (c CPUInfo) Transmeta() bool {
-       return c.VendorID == Transmeta
-}
-
-// NSC returns true if vendor is recognized as National Semiconductor
-func (c CPUInfo) NSC() bool {
-       return c.VendorID == NSC
-}
-
-// VIA returns true if vendor is recognized as VIA
-func (c CPUInfo) VIA() bool {
-       return c.VendorID == VIA
-}
-
-// RTCounter returns the 64-bit time-stamp counter
-// Uses the RDTSCP instruction. The value 0 is returned
-// if the CPU does not support the instruction.
-func (c CPUInfo) RTCounter() uint64 {
-       if !c.RDTSCP() {
-               return 0
-       }
-       a, _, _, d := rdtscpAsm()
-       return uint64(a) | (uint64(d) << 32)
-}
-
-// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP.
-// This variable is OS dependent, but on Linux contains information
-// about the current cpu/core the code is running on.
-// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned.
-func (c CPUInfo) Ia32TscAux() uint32 {
-       if !c.RDTSCP() {
-               return 0
-       }
-       _, _, ecx, _ := rdtscpAsm()
-       return ecx
-}
-
-// LogicalCPU will return the Logical CPU the code is currently executing on.
-// This is likely to change when the OS re-schedules the running thread
-// to another CPU.
-// If the current core cannot be detected, -1 will be returned.
-func (c CPUInfo) LogicalCPU() int {
-       if c.maxFunc < 1 {
-               return -1
-       }
-       _, ebx, _, _ := cpuid(1)
-       return int(ebx >> 24)
-}
-
-// VM Will return true if the cpu id indicates we are in
-// a virtual machine. This is only a hint, and will very likely
-// have many false negatives.
-func (c CPUInfo) VM() bool {
-       switch c.VendorID {
-       case MSVM, KVM, VMware, XenHVM:
-               return true
-       }
-       return false
-}
-
-// Flags contains detected cpu features and caracteristics
-type Flags uint64
-
-// String returns a string representation of the detected
-// CPU features.
-func (f Flags) String() string {
-       return strings.Join(f.Strings(), ",")
-}
-
-// Strings returns and array of the detected features.
-func (f Flags) Strings() []string {
-       s := support()
-       r := make([]string, 0, 20)
-       for i := uint(0); i < 64; i++ {
-               key := Flags(1 << i)
-               val := flagNames[key]
-               if s&key != 0 {
-                       r = append(r, val)
-               }
-       }
-       return r
-}
-
-func maxExtendedFunction() uint32 {
-       eax, _, _, _ := cpuid(0x80000000)
-       return eax
-}
-
-func maxFunctionID() uint32 {
-       a, _, _, _ := cpuid(0)
-       return a
-}
-
-func brandName() string {
-       if maxExtendedFunction() >= 0x80000004 {
-               v := make([]uint32, 0, 48)
-               for i := uint32(0); i < 3; i++ {
-                       a, b, c, d := cpuid(0x80000002 + i)
-                       v = append(v, a, b, c, d)
-               }
-               return strings.Trim(string(valAsString(v...)), " ")
-       }
-       return "unknown"
-}
-
-func threadsPerCore() int {
-       mfi := maxFunctionID()
-       if mfi < 0x4 || vendorID() != Intel {
-               return 1
-       }
-
-       if mfi < 0xb {
-               _, b, _, d := cpuid(1)
-               if (d & (1 << 28)) != 0 {
-                       // v will contain logical core count
-                       v := (b >> 16) & 255
-                       if v > 1 {
-                               a4, _, _, _ := cpuid(4)
-                               // physical cores
-                               v2 := (a4 >> 26) + 1
-                               if v2 > 0 {
-                                       return int(v) / int(v2)
-                               }
-                       }
-               }
-               return 1
-       }
-       _, b, _, _ := cpuidex(0xb, 0)
-       if b&0xffff == 0 {
-               return 1
-       }
-       return int(b & 0xffff)
-}
-
-func logicalCores() int {
-       mfi := maxFunctionID()
-       switch vendorID() {
-       case Intel:
-               // Use this on old Intel processors
-               if mfi < 0xb {
-                       if mfi < 1 {
-                               return 0
-                       }
-                       // CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID)
-                       // that can be assigned to logical processors in a physical package.
-                       // The value may not be the same as the number of logical processors that are present in the hardware of a physical package.
-                       _, ebx, _, _ := cpuid(1)
-                       logical := (ebx >> 16) & 0xff
-                       return int(logical)
-               }
-               _, b, _, _ := cpuidex(0xb, 1)
-               return int(b & 0xffff)
-       case AMD:
-               _, b, _, _ := cpuid(1)
-               return int((b >> 16) & 0xff)
-       default:
-               return 0
-       }
-}
-
-func familyModel() (int, int) {
-       if maxFunctionID() < 0x1 {
-               return 0, 0
-       }
-       eax, _, _, _ := cpuid(1)
-       family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff)
-       model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0)
-       return int(family), int(model)
-}
-
-func physicalCores() int {
-       switch vendorID() {
-       case Intel:
-               return logicalCores() / threadsPerCore()
-       case AMD:
-               if maxExtendedFunction() >= 0x80000008 {
-                       _, _, c, _ := cpuid(0x80000008)
-                       return int(c&0xff) + 1
-               }
-       }
-       return 0
-}
-
-// Except from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID
-var vendorMapping = map[string]Vendor{
-       "AMDisbetter!": AMD,
-       "AuthenticAMD": AMD,
-       "CentaurHauls": VIA,
-       "GenuineIntel": Intel,
-       "TransmetaCPU": Transmeta,
-       "GenuineTMx86": Transmeta,
-       "Geode by NSC": NSC,
-       "VIA VIA VIA ": VIA,
-       "KVMKVMKVMKVM": KVM,
-       "Microsoft Hv": MSVM,
-       "VMwareVMware": VMware,
-       "XenVMMXenVMM": XenHVM,
-}
-
-func vendorID() Vendor {
-       _, b, c, d := cpuid(0)
-       v := valAsString(b, d, c)
-       vend, ok := vendorMapping[string(v)]
-       if !ok {
-               return Other
-       }
-       return vend
-}
-
-func cacheLine() int {
-       if maxFunctionID() < 0x1 {
-               return 0
-       }
-
-       _, ebx, _, _ := cpuid(1)
-       cache := (ebx & 0xff00) >> 5 // cflush size
-       if cache == 0 && maxExtendedFunction() >= 0x80000006 {
-               _, _, ecx, _ := cpuid(0x80000006)
-               cache = ecx & 0xff // cacheline size
-       }
-       // TODO: Read from Cache and TLB Information
-       return int(cache)
-}
-
-func (c *CPUInfo) cacheSize() {
-       c.Cache.L1D = -1
-       c.Cache.L1I = -1
-       c.Cache.L2 = -1
-       c.Cache.L3 = -1
-       vendor := vendorID()
-       switch vendor {
-       case Intel:
-               if maxFunctionID() < 4 {
-                       return
-               }
-               for i := uint32(0); ; i++ {
-                       eax, ebx, ecx, _ := cpuidex(4, i)
-                       cacheType := eax & 15
-                       if cacheType == 0 {
-                               break
-                       }
-                       cacheLevel := (eax >> 5) & 7
-                       coherency := int(ebx&0xfff) + 1
-                       partitions := int((ebx>>12)&0x3ff) + 1
-                       associativity := int((ebx>>22)&0x3ff) + 1
-                       sets := int(ecx) + 1
-                       size := associativity * partitions * coherency * sets
-                       switch cacheLevel {
-                       case 1:
-                               if cacheType == 1 {
-                                       // 1 = Data Cache
-                                       c.Cache.L1D = size
-                               } else if cacheType == 2 {
-                                       // 2 = Instruction Cache
-                                       c.Cache.L1I = size
-                               } else {
-                                       if c.Cache.L1D < 0 {
-                                               c.Cache.L1I = size
-                                       }
-                                       if c.Cache.L1I < 0 {
-                                               c.Cache.L1I = size
-                                       }
-                               }
-                       case 2:
-                               c.Cache.L2 = size
-                       case 3:
-                               c.Cache.L3 = size
-                       }
-               }
-       case AMD:
-               // Untested.
-               if maxExtendedFunction() < 0x80000005 {
-                       return
-               }
-               _, _, ecx, edx := cpuid(0x80000005)
-               c.Cache.L1D = int(((ecx >> 24) & 0xFF) * 1024)
-               c.Cache.L1I = int(((edx >> 24) & 0xFF) * 1024)
-
-               if maxExtendedFunction() < 0x80000006 {
-                       return
-               }
-               _, _, ecx, _ = cpuid(0x80000006)
-               c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024)
-       }
-
-       return
-}
-
-type SGXSupport struct {
-       Available           bool
-       SGX1Supported       bool
-       SGX2Supported       bool
-       MaxEnclaveSizeNot64 int64
-       MaxEnclaveSize64    int64
-}
-
-func sgx(available bool) (rval SGXSupport) {
-       rval.Available = available
-
-       if !available {
-               return
-       }
-
-       a, _, _, d := cpuidex(0x12, 0)
-       rval.SGX1Supported = a&0x01 != 0
-       rval.SGX2Supported = a&0x02 != 0
-       rval.MaxEnclaveSizeNot64 = 1 << (d & 0xFF)     // pow 2
-       rval.MaxEnclaveSize64 = 1 << ((d >> 8) & 0xFF) // pow 2
-
-       return
-}
-
-func support() Flags {
-       mfi := maxFunctionID()
-       vend := vendorID()
-       if mfi < 0x1 {
-               return 0
-       }
-       rval := uint64(0)
-       _, _, c, d := cpuid(1)
-       if (d & (1 << 15)) != 0 {
-               rval |= CMOV
-       }
-       if (d & (1 << 23)) != 0 {
-               rval |= MMX
-       }
-       if (d & (1 << 25)) != 0 {
-               rval |= MMXEXT
-       }
-       if (d & (1 << 25)) != 0 {
-               rval |= SSE
-       }
-       if (d & (1 << 26)) != 0 {
-               rval |= SSE2
-       }
-       if (c & 1) != 0 {
-               rval |= SSE3
-       }
-       if (c & 0x00000200) != 0 {
-               rval |= SSSE3
-       }
-       if (c & 0x00080000) != 0 {
-               rval |= SSE4
-       }
-       if (c & 0x00100000) != 0 {
-               rval |= SSE42
-       }
-       if (c & (1 << 25)) != 0 {
-               rval |= AESNI
-       }
-       if (c & (1 << 1)) != 0 {
-               rval |= CLMUL
-       }
-       if c&(1<<23) != 0 {
-               rval |= POPCNT
-       }
-       if c&(1<<30) != 0 {
-               rval |= RDRAND
-       }
-       if c&(1<<29) != 0 {
-               rval |= F16C
-       }
-       if c&(1<<13) != 0 {
-               rval |= CX16
-       }
-       if vend == Intel && (d&(1<<28)) != 0 && mfi >= 4 {
-               if threadsPerCore() > 1 {
-                       rval |= HTT
-               }
-       }
-
-       // Check XGETBV, OXSAVE and AVX bits
-       if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 {
-               // Check for OS support
-               eax, _ := xgetbv(0)
-               if (eax & 0x6) == 0x6 {
-                       rval |= AVX
-                       if (c & 0x00001000) != 0 {
-                               rval |= FMA3
-                       }
-               }
-       }
-
-       // Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
-       if mfi >= 7 {
-               _, ebx, ecx, _ := cpuidex(7, 0)
-               if (rval&AVX) != 0 && (ebx&0x00000020) != 0 {
-                       rval |= AVX2
-               }
-               if (ebx & 0x00000008) != 0 {
-                       rval |= BMI1
-                       if (ebx & 0x00000100) != 0 {
-                               rval |= BMI2
-                       }
-               }
-               if ebx&(1<<2) != 0 {
-                       rval |= SGX
-               }
-               if ebx&(1<<4) != 0 {
-                       rval |= HLE
-               }
-               if ebx&(1<<9) != 0 {
-                       rval |= ERMS
-               }
-               if ebx&(1<<11) != 0 {
-                       rval |= RTM
-               }
-               if ebx&(1<<14) != 0 {
-                       rval |= MPX
-               }
-               if ebx&(1<<18) != 0 {
-                       rval |= RDSEED
-               }
-               if ebx&(1<<19) != 0 {
-                       rval |= ADX
-               }
-               if ebx&(1<<29) != 0 {
-                       rval |= SHA
-               }
-
-               // Only detect AVX-512 features if XGETBV is supported
-               if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
-                       // Check for OS support
-                       eax, _ := xgetbv(0)
-
-                       // Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and
-                       // ZMM16-ZMM31 state are enabled by OS)
-                       /// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS).
-                       if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 {
-                               if ebx&(1<<16) != 0 {
-                                       rval |= AVX512F
-                               }
-                               if ebx&(1<<17) != 0 {
-                                       rval |= AVX512DQ
-                               }
-                               if ebx&(1<<21) != 0 {
-                                       rval |= AVX512IFMA
-                               }
-                               if ebx&(1<<26) != 0 {
-                                       rval |= AVX512PF
-                               }
-                               if ebx&(1<<27) != 0 {
-                                       rval |= AVX512ER
-                               }
-                               if ebx&(1<<28) != 0 {
-                                       rval |= AVX512CD
-                               }
-                               if ebx&(1<<30) != 0 {
-                                       rval |= AVX512BW
-                               }
-                               if ebx&(1<<31) != 0 {
-                                       rval |= AVX512VL
-                               }
-                               // ecx
-                               if ecx&(1<<1) != 0 {
-                                       rval |= AVX512VBMI
-                               }
-                       }
-               }
-       }
-
-       if maxExtendedFunction() >= 0x80000001 {
-               _, _, c, d := cpuid(0x80000001)
-               if (c & (1 << 5)) != 0 {
-                       rval |= LZCNT
-                       rval |= POPCNT
-               }
-               if (d & (1 << 31)) != 0 {
-                       rval |= AMD3DNOW
-               }
-               if (d & (1 << 30)) != 0 {
-                       rval |= AMD3DNOWEXT
-               }
-               if (d & (1 << 23)) != 0 {
-                       rval |= MMX
-               }
-               if (d & (1 << 22)) != 0 {
-                       rval |= MMXEXT
-               }
-               if (c & (1 << 6)) != 0 {
-                       rval |= SSE4A
-               }
-               if d&(1<<20) != 0 {
-                       rval |= NX
-               }
-               if d&(1<<27) != 0 {
-                       rval |= RDTSCP
-               }
-
-               /* Allow for selectively disabling SSE2 functions on AMD processors
-                  with SSE2 support but not SSE4a. This includes Athlon64, some
-                  Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
-                  than SSE2 often enough to utilize this special-case flag.
-                  AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
-                  so that SSE2 is used unless explicitly disabled by checking
-                  AV_CPU_FLAG_SSE2SLOW. */
-               if vendorID() != Intel &&
-                       rval&SSE2 != 0 && (c&0x00000040) == 0 {
-                       rval |= SSE2SLOW
-               }
-
-               /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
-                * used unless the OS has AVX support. */
-               if (rval & AVX) != 0 {
-                       if (c & 0x00000800) != 0 {
-                               rval |= XOP
-                       }
-                       if (c & 0x00010000) != 0 {
-                               rval |= FMA4
-                       }
-               }
-
-               if vendorID() == Intel {
-                       family, model := familyModel()
-                       if family == 6 && (model == 9 || model == 13 || model == 14) {
-                               /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
-                                * 6/14 (core1 "yonah") theoretically support sse2, but it's
-                                * usually slower than mmx. */
-                               if (rval & SSE2) != 0 {
-                                       rval |= SSE2SLOW
-                               }
-                               if (rval & SSE3) != 0 {
-                                       rval |= SSE3SLOW
-                               }
-                       }
-                       /* The Atom processor has SSSE3 support, which is useful in many cases,
-                        * but the SSSE3 version is sometimes slower than the SSE2 equivalent
-                        * on the Atom, while it is generally faster on other processors that
-                        * support SSSE3. This flag allows for selectively disabling certain
-                        * SSSE3 functions on the Atom. */
-                       if family == 6 && model == 28 {
-                               rval |= ATOM
-                       }
-               }
-       }
-       return Flags(rval)
-}
-
-func valAsString(values ...uint32) []byte {
-       r := make([]byte, 4*len(values))
-       for i, v := range values {
-               dst := r[i*4:]
-               dst[0] = byte(v & 0xff)
-               dst[1] = byte((v >> 8) & 0xff)
-               dst[2] = byte((v >> 16) & 0xff)
-               dst[3] = byte((v >> 24) & 0xff)
-               switch {
-               case dst[0] == 0:
-                       return r[:i*4]
-               case dst[1] == 0:
-                       return r[:i*4+1]
-               case dst[2] == 0:
-                       return r[:i*4+2]
-               case dst[3] == 0:
-                       return r[:i*4+3]
-               }
-       }
-       return r
-}
diff --git a/vendor/github.com/klauspost/cpuid/cpuid_386.s b/vendor/github.com/klauspost/cpuid/cpuid_386.s
deleted file mode 100644 (file)
index 4d73171..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
-
-// +build 386,!gccgo
-
-// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
-TEXT ·asmCpuid(SB), 7, $0
-       XORL CX, CX
-       MOVL op+0(FP), AX
-       CPUID
-       MOVL AX, eax+4(FP)
-       MOVL BX, ebx+8(FP)
-       MOVL CX, ecx+12(FP)
-       MOVL DX, edx+16(FP)
-       RET
-
-// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
-TEXT ·asmCpuidex(SB), 7, $0
-       MOVL op+0(FP), AX
-       MOVL op2+4(FP), CX
-       CPUID
-       MOVL AX, eax+8(FP)
-       MOVL BX, ebx+12(FP)
-       MOVL CX, ecx+16(FP)
-       MOVL DX, edx+20(FP)
-       RET
-
-// func asmXgetbv(index uint32) (eax, edx uint32)
-TEXT ·asmXgetbv(SB), 7, $0
-       MOVL index+0(FP), CX
-       BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
-       MOVL AX, eax+4(FP)
-       MOVL DX, edx+8(FP)
-       RET
-
-// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
-TEXT ·asmRdtscpAsm(SB), 7, $0
-       BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
-       MOVL AX, eax+0(FP)
-       MOVL BX, ebx+4(FP)
-       MOVL CX, ecx+8(FP)
-       MOVL DX, edx+12(FP)
-       RET
diff --git a/vendor/github.com/klauspost/cpuid/cpuid_amd64.s b/vendor/github.com/klauspost/cpuid/cpuid_amd64.s
deleted file mode 100644 (file)
index 3c1d60e..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
-
-//+build amd64,!gccgo
-
-// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
-TEXT ·asmCpuid(SB), 7, $0
-       XORQ CX, CX
-       MOVL op+0(FP), AX
-       CPUID
-       MOVL AX, eax+8(FP)
-       MOVL BX, ebx+12(FP)
-       MOVL CX, ecx+16(FP)
-       MOVL DX, edx+20(FP)
-       RET
-
-// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
-TEXT ·asmCpuidex(SB), 7, $0
-       MOVL op+0(FP), AX
-       MOVL op2+4(FP), CX
-       CPUID
-       MOVL AX, eax+8(FP)
-       MOVL BX, ebx+12(FP)
-       MOVL CX, ecx+16(FP)
-       MOVL DX, edx+20(FP)
-       RET
-
-// func asmXgetbv(index uint32) (eax, edx uint32)
-TEXT ·asmXgetbv(SB), 7, $0
-       MOVL index+0(FP), CX
-       BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
-       MOVL AX, eax+8(FP)
-       MOVL DX, edx+12(FP)
-       RET
-
-// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
-TEXT ·asmRdtscpAsm(SB), 7, $0
-       BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
-       MOVL AX, eax+0(FP)
-       MOVL BX, ebx+4(FP)
-       MOVL CX, ecx+8(FP)
-       MOVL DX, edx+12(FP)
-       RET
diff --git a/vendor/github.com/klauspost/cpuid/detect_intel.go b/vendor/github.com/klauspost/cpuid/detect_intel.go
deleted file mode 100644 (file)
index a5f04dd..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
-
-// +build 386,!gccgo amd64,!gccgo
-
-package cpuid
-
-func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
-func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
-func asmXgetbv(index uint32) (eax, edx uint32)
-func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
-
-func initCPU() {
-       cpuid = asmCpuid
-       cpuidex = asmCpuidex
-       xgetbv = asmXgetbv
-       rdtscpAsm = asmRdtscpAsm
-}
diff --git a/vendor/github.com/klauspost/cpuid/detect_ref.go b/vendor/github.com/klauspost/cpuid/detect_ref.go
deleted file mode 100644 (file)
index 909c5d9..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
-
-// +build !amd64,!386 gccgo
-
-package cpuid
-
-func initCPU() {
-       cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) {
-               return 0, 0, 0, 0
-       }
-
-       cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
-               return 0, 0, 0, 0
-       }
-
-       xgetbv = func(index uint32) (eax, edx uint32) {
-               return 0, 0
-       }
-
-       rdtscpAsm = func() (eax, ebx, ecx, edx uint32) {
-               return 0, 0, 0, 0
-       }
-}
diff --git a/vendor/github.com/klauspost/cpuid/generate.go b/vendor/github.com/klauspost/cpuid/generate.go
deleted file mode 100644 (file)
index c060b81..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-package cpuid
-
-//go:generate go run private-gen.go
diff --git a/vendor/github.com/klauspost/crc32/.gitignore b/vendor/github.com/klauspost/crc32/.gitignore
deleted file mode 100644 (file)
index daf913b..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-# Compiled Object files, Static and Dynamic libs (Shared Objects)
-*.o
-*.a
-*.so
-
-# Folders
-_obj
-_test
-
-# Architecture specific extensions/prefixes
-*.[568vq]
-[568vq].out
-
-*.cgo1.go
-*.cgo2.c
-_cgo_defun.c
-_cgo_gotypes.go
-_cgo_export.*
-
-_testmain.go
-
-*.exe
-*.test
-*.prof
diff --git a/vendor/github.com/klauspost/crc32/.travis.yml b/vendor/github.com/klauspost/crc32/.travis.yml
deleted file mode 100644 (file)
index de64ae4..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-language: go
-
-go:
-  - 1.3
-  - 1.4
-  - 1.5
-  - 1.6
-  - 1.7
-  - tip
-
-script:
- - go test -v .
- - go test -v -race .
diff --git a/vendor/github.com/klauspost/crc32/LICENSE b/vendor/github.com/klauspost/crc32/LICENSE
deleted file mode 100644 (file)
index 4fd5963..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-Copyright (c) 2012 The Go Authors. All rights reserved.
-Copyright (c) 2015 Klaus Post
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-   * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-   * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-   * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/klauspost/crc32/README.md b/vendor/github.com/klauspost/crc32/README.md
deleted file mode 100644 (file)
index 029625d..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-# crc32
-CRC32 hash with x64 optimizations
-
-This package is a drop-in replacement for the standard library `hash/crc32` package that adds SSE 4.2 optimizations on x64 platforms, giving roughly a 10x speedup.
-
-[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32)
-
-# usage
-
-Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer.
-
-Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go.
-
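For reference, here is a minimal sketch of the drop-in usage described above. The sample input is illustrative only; `ChecksumIEEE`, `MakeTable`, `Castagnoli`, and `Checksum` are the exported names visible in the package's `crc32.go` later in this diff.

```go
package main

import (
	"fmt"

	// Drop-in replacement: only the import path differs from "hash/crc32".
	"github.com/klauspost/crc32"
)

func main() {
	data := []byte("hello, gitea")

	// IEEE polynomial, the same call as the standard library's ChecksumIEEE.
	fmt.Printf("IEEE:       %08x\n", crc32.ChecksumIEEE(data))

	// Castagnoli (CRC-32C); the package selects the SSE 4.2 path when available.
	tab := crc32.MakeTable(crc32.Castagnoli)
	fmt.Printf("Castagnoli: %08x\n", crc32.Checksum(data, tab))
}
```

Because the API mirrors `hash/crc32`, switching back to the standard library only requires reverting the import path.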
-# changes
-* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match.
-* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable.
-
-
-# performance
-
-As of *Go 1.7*, performance is equivalent to the standard library, so if you build with Go 1.7 or newer you can switch back to `hash/crc32`.
-
-
-For IEEE tables (the most common), there is approximately a factor 10 speedup with "CLMUL" (Carryless multiplication) instruction:
-```
-benchmark            old ns/op     new ns/op     delta
-BenchmarkCrc32KB     99955         10258         -89.74%
-
-benchmark            old MB/s     new MB/s     speedup
-BenchmarkCrc32KB     327.83       3194.20      9.74x
-```
-
-For other tables and "CLMUL"-capable machines, the performance is the same as the standard library.
-
-Here are some detailed benchmarks, comparing against the Go 1.5 standard library with and without assembler enabled.
-
-```
-Std:   Standard Go 1.5 library
-Crc:   Indicates IEEE type CRC.
-40B:   Size of each slice encoded.
-NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine).
-Castagnoli: Castagnoli CRC type.
-
-BenchmarkStdCrc40B-4            10000000               158 ns/op         252.88 MB/s
-BenchmarkCrc40BNoAsm-4          20000000               105 ns/op         377.38 MB/s (slice8)
-BenchmarkCrc40B-4               20000000               105 ns/op         378.77 MB/s (slice8)
-
-BenchmarkStdCrc1KB-4              500000              3604 ns/op         284.10 MB/s
-BenchmarkCrc1KBNoAsm-4           1000000              1463 ns/op         699.79 MB/s (slice8)
-BenchmarkCrc1KB-4                3000000               396 ns/op        2583.69 MB/s (asm)
-
-BenchmarkStdCrc8KB-4              200000             11417 ns/op         717.48 MB/s (slice8)
-BenchmarkCrc8KBNoAsm-4            200000             11317 ns/op         723.85 MB/s (slice8)
-BenchmarkCrc8KB-4                 500000              2919 ns/op        2805.73 MB/s (asm)
-
-BenchmarkStdCrc32KB-4              30000             45749 ns/op         716.24 MB/s (slice8)
-BenchmarkCrc32KBNoAsm-4            30000             45109 ns/op         726.42 MB/s (slice8)
-BenchmarkCrc32KB-4                100000             11497 ns/op        2850.09 MB/s (asm)
-
-BenchmarkStdNoAsmCastagnol40B-4 10000000               161 ns/op         246.94 MB/s
-BenchmarkStdCastagnoli40B-4     50000000              28.4 ns/op        1410.69 MB/s (asm)
-BenchmarkCastagnoli40BNoAsm-4   20000000               100 ns/op         398.01 MB/s (slice8)
-BenchmarkCastagnoli40B-4        50000000              28.2 ns/op        1419.54 MB/s (asm)
-
-BenchmarkStdNoAsmCastagnoli1KB-4  500000              3622 ns/op        282.67 MB/s
-BenchmarkStdCastagnoli1KB-4     10000000               144 ns/op        7099.78 MB/s (asm)
-BenchmarkCastagnoli1KBNoAsm-4    1000000              1475 ns/op         694.14 MB/s (slice8)
-BenchmarkCastagnoli1KB-4        10000000               146 ns/op        6993.35 MB/s (asm)
-
-BenchmarkStdNoAsmCastagnoli8KB-4  50000              28781 ns/op         284.63 MB/s
-BenchmarkStdCastagnoli8KB-4      1000000              1029 ns/op        7957.89 MB/s (asm)
-BenchmarkCastagnoli8KBNoAsm-4     200000             11410 ns/op         717.94 MB/s (slice8)
-BenchmarkCastagnoli8KB-4         1000000              1000 ns/op        8188.71 MB/s (asm)
-
-BenchmarkStdNoAsmCastagnoli32KB-4  10000            115426 ns/op         283.89 MB/s
-BenchmarkStdCastagnoli32KB-4      300000              4065 ns/op        8059.13 MB/s (asm)
-BenchmarkCastagnoli32KBNoAsm-4     30000             45171 ns/op         725.41 MB/s (slice8)
-BenchmarkCastagnoli32KB-4         500000              4077 ns/op        8035.89 MB/s (asm)
-```
-
-The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library.
-
-However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7.
-
-# license
-
-Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions.
diff --git a/vendor/github.com/klauspost/crc32/crc32.go b/vendor/github.com/klauspost/crc32/crc32.go
deleted file mode 100644 (file)
index 8aa91b1..0000000
+++ /dev/null
@@ -1,207 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
-// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
-// information.
-//
-// Polynomials are represented in LSB-first form, also known as reversed representation.
-//
-// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
-// for information.
-package crc32
-
-import (
-       "hash"
-       "sync"
-)
-
-// The size of a CRC-32 checksum in bytes.
-const Size = 4
-
-// Predefined polynomials.
-const (
-       // IEEE is by far and away the most common CRC-32 polynomial.
-       // Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
-       IEEE = 0xedb88320
-
-       // Castagnoli's polynomial, used in iSCSI.
-       // Has better error detection characteristics than IEEE.
-       // http://dx.doi.org/10.1109/26.231911
-       Castagnoli = 0x82f63b78
-
-       // Koopman's polynomial.
-       // Also has better error detection characteristics than IEEE.
-       // http://dx.doi.org/10.1109/DSN.2002.1028931
-       Koopman = 0xeb31d82e
-)
-
-// Table is a 256-word table representing the polynomial for efficient processing.
-type Table [256]uint32
-
-// This file makes use of functions implemented in architecture-specific files.
-// The interface that they implement is as follows:
-//
-//    // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE
-//    // algorithm is available.
-//    archAvailableIEEE() bool
-//
-//    // archInitIEEE initializes the architecture-specific CRC32-IEEE algorithm.
-//    // It can only be called if archAvailableIEEE() returns true.
-//    archInitIEEE()
-//
-//    // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if
-//    // archInitIEEE() was previously called.
-//    archUpdateIEEE(crc uint32, p []byte) uint32
-//
-//    // archAvailableCastagnoli reports whether an architecture-specific
-//    // CRC32-C algorithm is available.
-//    archAvailableCastagnoli() bool
-//
-//    // archInitCastagnoli initializes the architecture-specific CRC32-C
-//    // algorithm. It can only be called if archAvailableCastagnoli() returns
-//    // true.
-//    archInitCastagnoli()
-//
-//    // archUpdateCastagnoli updates the given CRC32-C. It can only be called
-//    // if archInitCastagnoli() was previously called.
-//    archUpdateCastagnoli(crc uint32, p []byte) uint32
-
-// castagnoliTable points to a lazily initialized Table for the Castagnoli
-// polynomial. MakeTable will always return this value when asked to make a
-// Castagnoli table so we can compare against it to find when the caller is
-// using this polynomial.
-var castagnoliTable *Table
-var castagnoliTable8 *slicing8Table
-var castagnoliArchImpl bool
-var updateCastagnoli func(crc uint32, p []byte) uint32
-var castagnoliOnce sync.Once
-
-func castagnoliInit() {
-       castagnoliTable = simpleMakeTable(Castagnoli)
-       castagnoliArchImpl = archAvailableCastagnoli()
-
-       if castagnoliArchImpl {
-               archInitCastagnoli()
-               updateCastagnoli = archUpdateCastagnoli
-       } else {
-               // Initialize the slicing-by-8 table.
-               castagnoliTable8 = slicingMakeTable(Castagnoli)
-               updateCastagnoli = func(crc uint32, p []byte) uint32 {
-                       return slicingUpdate(crc, castagnoliTable8, p)
-               }
-       }
-}
-
-// IEEETable is the table for the IEEE polynomial.
-var IEEETable = simpleMakeTable(IEEE)
-
-// ieeeTable8 is the slicing8Table for IEEE
-var ieeeTable8 *slicing8Table
-var ieeeArchImpl bool
-var updateIEEE func(crc uint32, p []byte) uint32
-var ieeeOnce sync.Once
-
-func ieeeInit() {
-       ieeeArchImpl = archAvailableIEEE()
-
-       if ieeeArchImpl {
-               archInitIEEE()
-               updateIEEE = archUpdateIEEE
-       } else {
-               // Initialize the slicing-by-8 table.
-               ieeeTable8 = slicingMakeTable(IEEE)
-               updateIEEE = func(crc uint32, p []byte) uint32 {
-                       return slicingUpdate(crc, ieeeTable8, p)
-               }
-       }
-}
-
-// MakeTable returns a Table constructed from the specified polynomial.
-// The contents of this Table must not be modified.
-func MakeTable(poly uint32) *Table {
-       switch poly {
-       case IEEE:
-               ieeeOnce.Do(ieeeInit)
-               return IEEETable
-       case Castagnoli:
-               castagnoliOnce.Do(castagnoliInit)
-               return castagnoliTable
-       }
-       return simpleMakeTable(poly)
-}
-
-// digest represents the partial evaluation of a checksum.
-type digest struct {
-       crc uint32
-       tab *Table
-}
-
-// New creates a new hash.Hash32 computing the CRC-32 checksum
-// using the polynomial represented by the Table.
-// Its Sum method will lay the value out in big-endian byte order.
-func New(tab *Table) hash.Hash32 {
-       if tab == IEEETable {
-               ieeeOnce.Do(ieeeInit)
-       }
-       return &digest{0, tab}
-}
-
-// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
-// using the IEEE polynomial.
-// Its Sum method will lay the value out in big-endian byte order.
-func NewIEEE() hash.Hash32 { return New(IEEETable) }
-
-func (d *digest) Size() int { return Size }
-
-func (d *digest) BlockSize() int { return 1 }
-
-func (d *digest) Reset() { d.crc = 0 }
-
-// Update returns the result of adding the bytes in p to the crc.
-func Update(crc uint32, tab *Table, p []byte) uint32 {
-       switch tab {
-       case castagnoliTable:
-               return updateCastagnoli(crc, p)
-       case IEEETable:
-               // Unfortunately, because IEEETable is exported, IEEE may be used without a
-               // call to MakeTable. We have to make sure it gets initialized in that case.
-               ieeeOnce.Do(ieeeInit)
-               return updateIEEE(crc, p)
-       default:
-               return simpleUpdate(crc, tab, p)
-       }
-}
-
-func (d *digest) Write(p []byte) (n int, err error) {
-       switch d.tab {
-       case castagnoliTable:
-               d.crc = updateCastagnoli(d.crc, p)
-       case IEEETable:
-               // We only create digest objects through New() which takes care of
-               // initialization in this case.
-               d.crc = updateIEEE(d.crc, p)
-       default:
-               d.crc = simpleUpdate(d.crc, d.tab, p)
-       }
-       return len(p), nil
-}
-
-func (d *digest) Sum32() uint32 { return d.crc }
-
-func (d *digest) Sum(in []byte) []byte {
-       s := d.Sum32()
-       return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s))
-}
-
-// Checksum returns the CRC-32 checksum of data
-// using the polynomial represented by the Table.
-func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }
-
-// ChecksumIEEE returns the CRC-32 checksum of data
-// using the IEEE polynomial.
-func ChecksumIEEE(data []byte) uint32 {
-       ieeeOnce.Do(ieeeInit)
-       return updateIEEE(0, data)
-}
diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.go b/vendor/github.com/klauspost/crc32/crc32_amd64.go
deleted file mode 100644 (file)
index af2a0b8..0000000
+++ /dev/null
@@ -1,230 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine,!gccgo
-
-// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
-// description of the interface that each architecture-specific file
-// implements.
-
-package crc32
-
-import "unsafe"
-
-// This file contains the code to call the SSE 4.2 version of the Castagnoli
-// and IEEE CRC.
-
-// haveSSE41/haveSSE42/haveCLMUL are defined in crc32_amd64.s and use
-// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
-func haveSSE41() bool
-func haveSSE42() bool
-func haveCLMUL() bool
-
-// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
-// instruction.
-//go:noescape
-func castagnoliSSE42(crc uint32, p []byte) uint32
-
-// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32
-// instruction.
-//go:noescape
-func castagnoliSSE42Triple(
-       crcA, crcB, crcC uint32,
-       a, b, c []byte,
-       rounds uint32,
-) (retA uint32, retB uint32, retC uint32)
-
-// ieeeCLMUL is defined in crc32_amd64.s and uses the PCLMULQDQ
-// instruction as well as SSE 4.1.
-//go:noescape
-func ieeeCLMUL(crc uint32, p []byte) uint32
-
-var sse42 = haveSSE42()
-var useFastIEEE = haveCLMUL() && haveSSE41()
-
-const castagnoliK1 = 168
-const castagnoliK2 = 1344
-
-type sse42Table [4]Table
-
-var castagnoliSSE42TableK1 *sse42Table
-var castagnoliSSE42TableK2 *sse42Table
-
-func archAvailableCastagnoli() bool {
-       return sse42
-}
-
-func archInitCastagnoli() {
-       if !sse42 {
-               panic("arch-specific Castagnoli not available")
-       }
-       castagnoliSSE42TableK1 = new(sse42Table)
-       castagnoliSSE42TableK2 = new(sse42Table)
-       // See description in updateCastagnoli.
-       //    t[0][i] = CRC(i000, O)
-       //    t[1][i] = CRC(0i00, O)
-       //    t[2][i] = CRC(00i0, O)
-       //    t[3][i] = CRC(000i, O)
-       // where O is a sequence of K zeros.
-       var tmp [castagnoliK2]byte
-       for b := 0; b < 4; b++ {
-               for i := 0; i < 256; i++ {
-                       val := uint32(i) << uint32(b*8)
-                       castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1])
-                       castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:])
-               }
-       }
-}
-
-// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the
-// table given) with the given initial crc value. This corresponds to
-// CRC(crc, O) in the description in updateCastagnoli.
-func castagnoliShift(table *sse42Table, crc uint32) uint32 {
-       return table[3][crc>>24] ^
-               table[2][(crc>>16)&0xFF] ^
-               table[1][(crc>>8)&0xFF] ^
-               table[0][crc&0xFF]
-}
-
-func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
-       if !sse42 {
-               panic("not available")
-       }
-
-       // This method is inspired by the algorithm in Intel's white paper:
-       //    "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
-       // The same strategy of splitting the buffer in three is used but the
-       // combining calculation is different; the complete derivation is explained
-       // below.
-       //
-       // -- The basic idea --
-       //
-       // The CRC32 instruction (available in SSE4.2) can process 8 bytes at a
-       // time. In recent Intel architectures the instruction takes 3 cycles;
-       // however the processor can pipeline up to three instructions if they
-       // don't depend on each other.
-       //
-       // Roughly this means that we can process three buffers in about the same
-       // time we can process one buffer.
-       //
-       // The idea is then to split the buffer in three, CRC the three pieces
-       // separately and then combine the results.
-       //
-       // Combining the results requires precomputed tables, so we must choose a
-       // fixed buffer length to optimize. The longer the length, the faster; but
-       // only buffers longer than this length will use the optimization. We choose
-       // two cutoffs and compute tables for both:
-       //  - one around 512: 168*3=504
-       //  - one around 4KB: 1344*3=4032
-       //
-       // -- The nitty gritty --
-       //
-       // Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with
-       // initial non-inverted CRC I). This function has the following properties:
-       //   (a) CRC(I, AB) = CRC(CRC(I, A), B)
-       //   (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)
-       //
-       // Say we want to compute CRC(I, ABC) where A, B, C are three sequences of
-       // K bytes each, where K is a fixed constant. Let O be the sequence of K zero
-       // bytes.
-       //
-       // CRC(I, ABC) = CRC(I, ABO xor C)
-       //             = CRC(I, ABO) xor CRC(0, C)
-       //             = CRC(CRC(I, AB), O) xor CRC(0, C)
-       //             = CRC(CRC(I, AO xor B), O) xor CRC(0, C)
-       //             = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)
-       //             = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
-       //
-       // The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),
-       // and CRC(0, C) efficiently.  We just need to find a way to quickly compute
-       // CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these
-       // values; since we can't have a 32-bit table, we break it up into four
-       // 8-bit tables:
-       //
-       //    CRC(uvwx, O) = CRC(u000, O) xor
-       //                   CRC(0v00, O) xor
-       //                   CRC(00w0, O) xor
-       //                   CRC(000x, O)
-       //
-       // We can compute tables corresponding to the four terms for all 8-bit
-       // values.
-
-       crc = ^crc
-
-       // If a buffer is long enough to use the optimization, process the first few
-       // bytes to align the buffer to an 8 byte boundary (if necessary).
-       if len(p) >= castagnoliK1*3 {
-               delta := int(uintptr(unsafe.Pointer(&p[0])) & 7)
-               if delta != 0 {
-                       delta = 8 - delta
-                       crc = castagnoliSSE42(crc, p[:delta])
-                       p = p[delta:]
-               }
-       }
-
-       // Process 3*K2 at a time.
-       for len(p) >= castagnoliK2*3 {
-               // Compute CRC(I, A), CRC(0, B), and CRC(0, C).
-               crcA, crcB, crcC := castagnoliSSE42Triple(
-                       crc, 0, 0,
-                       p, p[castagnoliK2:], p[castagnoliK2*2:],
-                       castagnoliK2/24)
-
-               // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
-               crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB
-               // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
-               crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC
-               p = p[castagnoliK2*3:]
-       }
-
-       // Process 3*K1 at a time.
-       for len(p) >= castagnoliK1*3 {
-               // Compute CRC(I, A), CRC(0, B), and CRC(0, C).
-               crcA, crcB, crcC := castagnoliSSE42Triple(
-                       crc, 0, 0,
-                       p, p[castagnoliK1:], p[castagnoliK1*2:],
-                       castagnoliK1/24)
-
-               // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
-               crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB
-               // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
-               crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC
-               p = p[castagnoliK1*3:]
-       }
-
-       // Use the simple implementation for what's left.
-       crc = castagnoliSSE42(crc, p)
-       return ^crc
-}
-
-func archAvailableIEEE() bool {
-       return useFastIEEE
-}
-
-var archIeeeTable8 *slicing8Table
-
-func archInitIEEE() {
-       if !useFastIEEE {
-               panic("not available")
-       }
-       // We still use slicing-by-8 for small buffers.
-       archIeeeTable8 = slicingMakeTable(IEEE)
-}
-
-func archUpdateIEEE(crc uint32, p []byte) uint32 {
-       if !useFastIEEE {
-               panic("not available")
-       }
-
-       if len(p) >= 64 {
-               left := len(p) & 15
-               do := len(p) - left
-               crc = ^ieeeCLMUL(^crc, p[:do])
-               p = p[do:]
-       }
-       if len(p) == 0 {
-               return crc
-       }
-       return slicingUpdate(crc, archIeeeTable8, p)
-}
diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.s b/vendor/github.com/klauspost/crc32/crc32_amd64.s
deleted file mode 100644 (file)
index e8a7941..0000000
+++ /dev/null
@@ -1,319 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build gc
-
-#define NOSPLIT 4
-#define RODATA 8
-
-// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
-//
-// func castagnoliSSE42(crc uint32, p []byte) uint32
-TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
-       MOVL crc+0(FP), AX    // CRC value
-       MOVQ p+8(FP), SI      // data pointer
-       MOVQ p_len+16(FP), CX // len(p)
-
-       // If there are fewer than 8 bytes to process, skip alignment.
-       CMPQ CX, $8
-       JL   less_than_8
-
-       MOVQ SI, BX
-       ANDQ $7, BX
-       JZ   aligned
-
-       // Process the first few bytes to 8-byte align the input.
-
-       // BX = 8 - BX. We need to process this many bytes to align.
-       SUBQ $1, BX
-       XORQ $7, BX
-
-       BTQ $0, BX
-       JNC align_2
-
-       CRC32B (SI), AX
-       DECQ   CX
-       INCQ   SI
-
-align_2:
-       BTQ $1, BX
-       JNC align_4
-
-       // CRC32W (SI), AX
-       BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
-
-       SUBQ $2, CX
-       ADDQ $2, SI
-
-align_4:
-       BTQ $2, BX
-       JNC aligned
-
-       // CRC32L (SI), AX
-       BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
-
-       SUBQ $4, CX
-       ADDQ $4, SI
-
-aligned:
-       // The input is now 8-byte aligned and we can process 8-byte chunks.
-       CMPQ CX, $8
-       JL   less_than_8
-
-       CRC32Q (SI), AX
-       ADDQ   $8, SI
-       SUBQ   $8, CX
-       JMP    aligned
-
-less_than_8:
-       // We may have some bytes left over; process 4 bytes, then 2, then 1.
-       BTQ $2, CX
-       JNC less_than_4
-
-       // CRC32L (SI), AX
-       BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
-       ADDQ $4, SI
-
-less_than_4:
-       BTQ $1, CX
-       JNC less_than_2
-
-       // CRC32W (SI), AX
-       BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
-       ADDQ $2, SI
-
-less_than_2:
-       BTQ $0, CX
-       JNC done
-
-       CRC32B (SI), AX
-
-done:
-       MOVL AX, ret+32(FP)
-       RET
-
-// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
-// bytes from each buffer.
-//
-// func castagnoliSSE42Triple(
-//     crc1, crc2, crc3 uint32,
-//     a, b, c []byte,
-//     rounds uint32,
-// ) (retA uint32, retB uint32, retC uint32)
-TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0
-       MOVL crcA+0(FP), AX
-       MOVL crcB+4(FP), CX
-       MOVL crcC+8(FP), DX
-
-       MOVQ a+16(FP), R8  // data pointer
-       MOVQ b+40(FP), R9  // data pointer
-       MOVQ c+64(FP), R10 // data pointer
-
-       MOVL rounds+88(FP), R11
-
-loop:
-       CRC32Q (R8), AX
-       CRC32Q (R9), CX
-       CRC32Q (R10), DX
-
-       CRC32Q 8(R8), AX
-       CRC32Q 8(R9), CX
-       CRC32Q 8(R10), DX
-
-       CRC32Q 16(R8), AX
-       CRC32Q 16(R9), CX
-       CRC32Q 16(R10), DX
-
-       ADDQ $24, R8
-       ADDQ $24, R9
-       ADDQ $24, R10
-
-       DECQ R11
-       JNZ  loop
-
-       MOVL AX, retA+96(FP)
-       MOVL CX, retB+100(FP)
-       MOVL DX, retC+104(FP)
-       RET
-
-// func haveSSE42() bool
-TEXT ·haveSSE42(SB), NOSPLIT, $0
-       XORQ AX, AX
-       INCL AX
-       CPUID
-       SHRQ $20, CX
-       ANDQ $1, CX
-       MOVB CX, ret+0(FP)
-       RET
-
-// func haveCLMUL() bool
-TEXT ·haveCLMUL(SB), NOSPLIT, $0
-       XORQ AX, AX
-       INCL AX
-       CPUID
-       SHRQ $1, CX
-       ANDQ $1, CX
-       MOVB CX, ret+0(FP)
-       RET
-
-// func haveSSE41() bool
-TEXT ·haveSSE41(SB), NOSPLIT, $0
-       XORQ AX, AX
-       INCL AX
-       CPUID
-       SHRQ $19, CX
-       ANDQ $1, CX
-       MOVB CX, ret+0(FP)
-       RET
-
-// CRC32 polynomial data
-//
-// These constants are lifted from the
-// Linux kernel, since they avoid the costly
-// PSHUFB 16 byte reversal proposed in the
-// original Intel paper.
-DATA r2r1kp<>+0(SB)/8, $0x154442bd4
-DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
-DATA r4r3kp<>+0(SB)/8, $0x1751997d0
-DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
-DATA rupolykp<>+0(SB)/8, $0x1db710641
-DATA rupolykp<>+8(SB)/8, $0x1f7011641
-DATA r5kp<>+0(SB)/8, $0x163cd6124
-
-GLOBL r2r1kp<>(SB), RODATA, $16
-GLOBL r4r3kp<>(SB), RODATA, $16
-GLOBL rupolykp<>(SB), RODATA, $16
-GLOBL r5kp<>(SB), RODATA, $8
-
-// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-// len(p) must be at least 64, and must be a multiple of 16.
-
-// func ieeeCLMUL(crc uint32, p []byte) uint32
-TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
-       MOVL crc+0(FP), X0    // Initial CRC value
-       MOVQ p+8(FP), SI      // data pointer
-       MOVQ p_len+16(FP), CX // len(p)
-
-       MOVOU (SI), X1
-       MOVOU 16(SI), X2
-       MOVOU 32(SI), X3
-       MOVOU 48(SI), X4
-       PXOR  X0, X1
-       ADDQ  $64, SI    // buf+=64
-       SUBQ  $64, CX    // len-=64
-       CMPQ  CX, $64    // Less than 64 bytes left
-       JB    remain64
-
-       MOVOA r2r1kp<>+0(SB), X0
-
-loopback64:
-       MOVOA X1, X5
-       MOVOA X2, X6
-       MOVOA X3, X7
-       MOVOA X4, X8
-
-       PCLMULQDQ $0, X0, X1
-       PCLMULQDQ $0, X0, X2
-       PCLMULQDQ $0, X0, X3
-       PCLMULQDQ $0, X0, X4
-
-       // Load next early
-       MOVOU (SI), X11
-       MOVOU 16(SI), X12
-       MOVOU 32(SI), X13
-       MOVOU 48(SI), X14
-
-       PCLMULQDQ $0x11, X0, X5
-       PCLMULQDQ $0x11, X0, X6
-       PCLMULQDQ $0x11, X0, X7
-       PCLMULQDQ $0x11, X0, X8
-
-       PXOR X5, X1
-       PXOR X6, X2
-       PXOR X7, X3
-       PXOR X8, X4
-
-       PXOR X11, X1
-       PXOR X12, X2
-       PXOR X13, X3
-       PXOR X14, X4
-
-       ADDQ $0x40, DI
-       ADDQ $64, SI    // buf+=64
-       SUBQ $64, CX    // len-=64
-       CMPQ CX, $64    // Less than 64 bytes left?
-       JGE  loopback64
-
-       // Fold result into a single register (X1)
-remain64:
-       MOVOA r4r3kp<>+0(SB), X0
-
-       MOVOA     X1, X5
-       PCLMULQDQ $0, X0, X1
-       PCLMULQDQ $0x11, X0, X5
-       PXOR      X5, X1
-       PXOR      X2, X1
-
-       MOVOA     X1, X5
-       PCLMULQDQ $0, X0, X1
-       PCLMULQDQ $0x11, X0, X5
-       PXOR      X5, X1
-       PXOR      X3, X1
-
-       MOVOA     X1, X5
-       PCLMULQDQ $0, X0, X1
-       PCLMULQDQ $0x11, X0, X5
-       PXOR      X5, X1
-       PXOR      X4, X1
-
-       // If there are fewer than 16 bytes left we are done
-       CMPQ CX, $16
-       JB   finish
-
-       // Encode 16 bytes
-remain16:
-       MOVOU     (SI), X10
-       MOVOA     X1, X5
-       PCLMULQDQ $0, X0, X1
-       PCLMULQDQ $0x11, X0, X5
-       PXOR      X5, X1
-       PXOR      X10, X1
-       SUBQ      $16, CX
-       ADDQ      $16, SI
-       CMPQ      CX, $16
-       JGE       remain16
-
-finish:
-       // Fold final result into 32 bits and return it
-       PCMPEQB   X3, X3
-       PCLMULQDQ $1, X1, X0
-       PSRLDQ    $8, X1
-       PXOR      X0, X1
-
-       MOVOA X1, X2
-       MOVQ  r5kp<>+0(SB), X0
-
-       // Creates 32 bit mask. Note that we don't care about upper half.
-       PSRLQ $32, X3
-
-       PSRLDQ    $4, X2
-       PAND      X3, X1
-       PCLMULQDQ $0, X0, X1
-       PXOR      X2, X1
-
-       MOVOA rupolykp<>+0(SB), X0
-
-       MOVOA     X1, X2
-       PAND      X3, X1
-       PCLMULQDQ $0x10, X0, X1
-       PAND      X3, X1
-       PCLMULQDQ $0, X0, X1
-       PXOR      X2, X1
-
-       // PEXTRD   $1, X1, AX  (SSE 4.1)
-       BYTE $0x66; BYTE $0x0f; BYTE $0x3a
-       BYTE $0x16; BYTE $0xc8; BYTE $0x01
-       MOVL AX, ret+32(FP)
-
-       RET
diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64p32.go b/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
deleted file mode 100644 (file)
index 3222b06..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine,!gccgo
-
-package crc32
-
-// This file contains the code to call the SSE 4.2 version of the Castagnoli
-// CRC.
-
-// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
-// support.
-func haveSSE42() bool
-
-// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32
-// instruction.
-//go:noescape
-func castagnoliSSE42(crc uint32, p []byte) uint32
-
-var sse42 = haveSSE42()
-
-func archAvailableCastagnoli() bool {
-       return sse42
-}
-
-func archInitCastagnoli() {
-       if !sse42 {
-               panic("not available")
-       }
-       // No initialization necessary.
-}
-
-func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
-       if !sse42 {
-               panic("not available")
-       }
-       return castagnoliSSE42(crc, p)
-}
-
-func archAvailableIEEE() bool                    { return false }
-func archInitIEEE()                              { panic("not available") }
-func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }
diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64p32.s b/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
deleted file mode 100644 (file)
index a578d68..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build gc
-
-#define NOSPLIT 4
-#define RODATA 8
-
-// func castagnoliSSE42(crc uint32, p []byte) uint32
-TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
-       MOVL crc+0(FP), AX   // CRC value
-       MOVL p+4(FP), SI     // data pointer
-       MOVL p_len+8(FP), CX // len(p)
-
-       NOTL AX
-
-       // If there are fewer than 8 bytes to process, we do it byte-by-byte.
-       CMPQ CX, $8
-       JL   cleanup
-
-       // Process individual bytes until the input is 8-byte aligned.
-startup:
-       MOVQ SI, BX
-       ANDQ $7, BX
-       JZ   aligned
-
-       CRC32B (SI), AX
-       DECQ   CX
-       INCQ   SI
-       JMP    startup
-
-aligned:
-       // The input is now 8-byte aligned and we can process 8-byte chunks.
-       CMPQ CX, $8
-       JL   cleanup
-
-       CRC32Q (SI), AX
-       ADDQ   $8, SI
-       SUBQ   $8, CX
-       JMP    aligned
-
-cleanup:
-       // We may have some bytes left over that we process one at a time.
-       CMPQ CX, $0
-       JE   done
-
-       CRC32B (SI), AX
-       INCQ   SI
-       DECQ   CX
-       JMP    cleanup
-
-done:
-       NOTL AX
-       MOVL AX, ret+16(FP)
-       RET
-
-// func haveSSE42() bool
-TEXT ·haveSSE42(SB), NOSPLIT, $0
-       XORQ AX, AX
-       INCL AX
-       CPUID
-       SHRQ $20, CX
-       ANDQ $1, CX
-       MOVB CX, ret+0(FP)
-       RET
-
diff --git a/vendor/github.com/klauspost/crc32/crc32_generic.go b/vendor/github.com/klauspost/crc32/crc32_generic.go
deleted file mode 100644 (file)
index abacbb6..0000000
+++ /dev/null
@@ -1,89 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file contains CRC32 algorithms that are not specific to any architecture
-// and don't use hardware acceleration.
-//
-// The simple (and slow) CRC32 implementation only uses a 256*4 bytes table.
-//
-// The slicing-by-8 algorithm is a faster implementation that uses a bigger
-// table (8*256*4 bytes).
-
-package crc32
-
-// simpleMakeTable allocates and constructs a Table for the specified
-// polynomial. The table is suitable for use with the simple algorithm
-// (simpleUpdate).
-func simpleMakeTable(poly uint32) *Table {
-       t := new(Table)
-       simplePopulateTable(poly, t)
-       return t
-}
-
-// simplePopulateTable constructs a Table for the specified polynomial, suitable
-// for use with simpleUpdate.
-func simplePopulateTable(poly uint32, t *Table) {
-       for i := 0; i < 256; i++ {
-               crc := uint32(i)
-               for j := 0; j < 8; j++ {
-                       if crc&1 == 1 {
-                               crc = (crc >> 1) ^ poly
-                       } else {
-                               crc >>= 1
-                       }
-               }
-               t[i] = crc
-       }
-}
-
-// simpleUpdate uses the simple algorithm to update the CRC, given a table that
-// was previously computed using simpleMakeTable.
-func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 {
-       crc = ^crc
-       for _, v := range p {
-               crc = tab[byte(crc)^v] ^ (crc >> 8)
-       }
-       return ^crc
-}
-
-// Use slicing-by-8 when payload >= this value.
-const slicing8Cutoff = 16
-
-// slicing8Table is an array of 8 Tables, used by the slicing-by-8 algorithm.
-type slicing8Table [8]Table
-
-// slicingMakeTable constructs a slicing8Table for the specified polynomial. The
-// table is suitable for use with the slicing-by-8 algorithm (slicingUpdate).
-func slicingMakeTable(poly uint32) *slicing8Table {
-       t := new(slicing8Table)
-       simplePopulateTable(poly, &t[0])
-       for i := 0; i < 256; i++ {
-               crc := t[0][i]
-               for j := 1; j < 8; j++ {
-                       crc = t[0][crc&0xFF] ^ (crc >> 8)
-                       t[j][i] = crc
-               }
-       }
-       return t
-}
-
-// slicingUpdate uses the slicing-by-8 algorithm to update the CRC, given a
-// table that was previously computed using slicingMakeTable.
-func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 {
-       if len(p) >= slicing8Cutoff {
-               crc = ^crc
-               for len(p) > 8 {
-                       crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
-                       crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
-                               tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
-                               tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
-                       p = p[8:]
-               }
-               crc = ^crc
-       }
-       if len(p) == 0 {
-               return crc
-       }
-       return simpleUpdate(crc, &tab[0], p)
-}
diff --git a/vendor/github.com/klauspost/crc32/crc32_otherarch.go b/vendor/github.com/klauspost/crc32/crc32_otherarch.go
deleted file mode 100644 (file)
index cc96076..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64,!amd64p32,!s390x
-
-package crc32
-
-func archAvailableIEEE() bool                    { return false }
-func archInitIEEE()                              { panic("not available") }
-func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }
-
-func archAvailableCastagnoli() bool                    { return false }
-func archInitCastagnoli()                              { panic("not available") }
-func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") }
diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.go b/vendor/github.com/klauspost/crc32/crc32_s390x.go
deleted file mode 100644 (file)
index ce96f03..0000000
+++ /dev/null
@@ -1,91 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x
-
-package crc32
-
-const (
-       vxMinLen    = 64
-       vxAlignMask = 15 // align to 16 bytes
-)
-
-// hasVectorFacility reports whether the machine has the z/Architecture
-// vector facility installed and enabled.
-func hasVectorFacility() bool
-
-var hasVX = hasVectorFacility()
-
-// vectorizedCastagnoli implements CRC32 using vector instructions.
-// It is defined in crc32_s390x.s.
-//go:noescape
-func vectorizedCastagnoli(crc uint32, p []byte) uint32
-
-// vectorizedIEEE implements CRC32 using vector instructions.
-// It is defined in crc32_s390x.s.
-//go:noescape
-func vectorizedIEEE(crc uint32, p []byte) uint32
-
-func archAvailableCastagnoli() bool {
-       return hasVX
-}
-
-var archCastagnoliTable8 *slicing8Table
-
-func archInitCastagnoli() {
-       if !hasVX {
-               panic("not available")
-       }
-       // We still use slicing-by-8 for small buffers.
-       archCastagnoliTable8 = slicingMakeTable(Castagnoli)
-}
-
-// archUpdateCastagnoli calculates the checksum of p using
-// vectorizedCastagnoli.
-func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
-       if !hasVX {
-               panic("not available")
-       }
-       // Use vectorized function if data length is above threshold.
-       if len(p) >= vxMinLen {
-               aligned := len(p) & ^vxAlignMask
-               crc = vectorizedCastagnoli(crc, p[:aligned])
-               p = p[aligned:]
-       }
-       if len(p) == 0 {
-               return crc
-       }
-       return slicingUpdate(crc, archCastagnoliTable8, p)
-}
-
-func archAvailableIEEE() bool {
-       return hasVX
-}
-
-var archIeeeTable8 *slicing8Table
-
-func archInitIEEE() {
-       if !hasVX {
-               panic("not available")
-       }
-       // We still use slicing-by-8 for small buffers.
-       archIeeeTable8 = slicingMakeTable(IEEE)
-}
-
-// archUpdateIEEE calculates the checksum of p using vectorizedIEEE.
-func archUpdateIEEE(crc uint32, p []byte) uint32 {
-       if !hasVX {
-               panic("not available")
-       }
-       // Use vectorized function if data length is above threshold.
-       if len(p) >= vxMinLen {
-               aligned := len(p) & ^vxAlignMask
-               crc = vectorizedIEEE(crc, p[:aligned])
-               p = p[aligned:]
-       }
-       if len(p) == 0 {
-               return crc
-       }
-       return slicingUpdate(crc, archIeeeTable8, p)
-}
diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.s b/vendor/github.com/klauspost/crc32/crc32_s390x.s
deleted file mode 100644 (file)
index e980ca2..0000000
+++ /dev/null
@@ -1,249 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x
-
-#include "textflag.h"
-
-// Vector register range containing CRC-32 constants
-
-#define CONST_PERM_LE2BE        V9
-#define CONST_R2R1              V10
-#define CONST_R4R3              V11
-#define CONST_R5                V12
-#define CONST_RU_POLY           V13
-#define CONST_CRC_POLY          V14
-
-// The CRC-32 constant block contains reduction constants to fold and
-// process particular chunks of the input data stream in parallel.
-//
-// Note that the constant definitions below are extended in order to compute
-// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
-// The rightmost doubleword can be 0 to prevent contribution to the result or
-// can be multiplied by 1 to perform an XOR without the need for a separate
-// VECTOR EXCLUSIVE OR instruction.
-//
-// The polynomials used are bit-reflected:
-//
-//            IEEE: P'(x) = 0x0edb88320
-//      Castagnoli: P'(x) = 0x082f63b78
-
-// IEEE polynomial constants
-DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
-DATA ·crcleconskp+8(SB)/8, $0x0706050403020100
-DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
-DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
-DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
-DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
-DATA ·crcleconskp+48(SB)/8, $0x0000000000000000
-DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
-DATA ·crcleconskp+64(SB)/8, $0x0000000000000000
-DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
-DATA ·crcleconskp+80(SB)/8, $0x0000000000000000
-DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
-
-GLOBL ·crcleconskp(SB), RODATA, $144
-
-// Castagnoli polynomial constants
-DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
-DATA ·crccleconskp+8(SB)/8, $0x0706050403020100
-DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
-DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
-DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
-DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
-DATA ·crccleconskp+48(SB)/8, $0x0000000000000000
-DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
-DATA ·crccleconskp+64(SB)/8, $0x0000000000000000
-DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
-DATA ·crccleconskp+80(SB)/8, $0x0000000000000000
-DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
-
-GLOBL ·crccleconskp(SB), RODATA, $144
-
-// func hasVectorFacility() bool
-TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
-       MOVD  $x-24(SP), R1
-       XC    $24, 0(R1), 0(R1) // clear the storage
-       MOVD  $2, R0            // R0 is the number of double words stored -1
-       WORD  $0xB2B01000       // STFLE 0(R1)
-       XOR   R0, R0            // reset the value of R0
-       MOVBZ z-8(SP), R1
-       AND   $0x40, R1
-       BEQ   novector
-
-vectorinstalled:
-       // check if the vector instruction has been enabled
-       VLEIB  $0, $0xF, V16
-       VLGVB  $0, V16, R1
-       CMPBNE R1, $0xF, novector
-       MOVB   $1, ret+0(FP)      // have vx
-       RET
-
-novector:
-       MOVB $0, ret+0(FP) // no vx
-       RET
-
-// The CRC-32 function(s) use these calling conventions:
-//
-// Parameters:
-//
-//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
-//      R3:    Input buffer pointer, performance might be improved if the
-//             buffer is on a doubleword boundary.
-//      R4:    Length of the buffer, must be 64 bytes or greater.
-//
-// Register usage:
-//
-//      R5:     CRC-32 constant pool base pointer.
-//      V0:     Initial CRC value and intermediate constants and results.
-//      V1..V4: Data for CRC computation.
-//      V5..V8: Next data chunks that are fetched from the input buffer.
-//
-//      V9..V14: CRC-32 constants.
-
-// func vectorizedIEEE(crc uint32, p []byte) uint32
-TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
-       MOVWZ crc+0(FP), R2    // R2 stores the CRC value
-       MOVD  p+8(FP), R3      // data pointer
-       MOVD  p_len+16(FP), R4 // len(p)
-
-       MOVD $·crcleconskp(SB), R5
-       BR   vectorizedBody<>(SB)
-
-// func vectorizedCastagnoli(crc uint32, p []byte) uint32
-TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
-       MOVWZ crc+0(FP), R2    // R2 stores the CRC value
-       MOVD  p+8(FP), R3      // data pointer
-       MOVD  p_len+16(FP), R4 // len(p)
-
-       // R5: crc-32 constant pool base pointer, constant is used to reduce crc
-       MOVD $·crccleconskp(SB), R5
-       BR   vectorizedBody<>(SB)
-
-TEXT vectorizedBody<>(SB), NOSPLIT, $0
-       XOR $0xffffffff, R2                         // NOTW R2
-       VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
-
-       // Load the initial CRC value into the rightmost word of V0
-       VZERO V0
-       VLVGF $3, R2, V0
-
-       // Crash if the input size is less than 64-bytes.
-       CMP R4, $64
-       BLT crash
-
-       // Load a 64-byte data chunk and XOR with CRC
-       VLM 0(R3), V1, V4 // 64-bytes into V1..V4
-
-       // Reflect the data if the CRC operation is in the bit-reflected domain
-       VPERM V1, V1, CONST_PERM_LE2BE, V1
-       VPERM V2, V2, CONST_PERM_LE2BE, V2
-       VPERM V3, V3, CONST_PERM_LE2BE, V3
-       VPERM V4, V4, CONST_PERM_LE2BE, V4
-
-       VX  V0, V1, V1 // V1 ^= CRC
-       ADD $64, R3    // BUF = BUF + 64
-       ADD $(-64), R4
-
-       // Check remaining buffer size and jump to proper folding method
-       CMP R4, $64
-       BLT less_than_64bytes
-
-fold_64bytes_loop:
-       // Load the next 64-byte data chunk into V5 to V8
-       VLM   0(R3), V5, V8
-       VPERM V5, V5, CONST_PERM_LE2BE, V5
-       VPERM V6, V6, CONST_PERM_LE2BE, V6
-       VPERM V7, V7, CONST_PERM_LE2BE, V7
-       VPERM V8, V8, CONST_PERM_LE2BE, V8
-
-       // Perform a GF(2) multiplication of the doublewords in V1 with
-       // the reduction constants in V0.  The intermediate result is
-       // then folded (accumulated) with the next data chunk in V5 and
-       // stored in V1.  Repeat this step for the register contents
-       // in V2, V3, and V4 respectively.
-
-       VGFMAG CONST_R2R1, V1, V5, V1
-       VGFMAG CONST_R2R1, V2, V6, V2
-       VGFMAG CONST_R2R1, V3, V7, V3
-       VGFMAG CONST_R2R1, V4, V8, V4
-
-       // Adjust buffer pointer and length for next loop
-       ADD $64, R3    // BUF = BUF + 64
-       ADD $(-64), R4 // LEN = LEN - 64
-
-       CMP R4, $64
-       BGE fold_64bytes_loop
-
-less_than_64bytes:
-       // Fold V1 to V4 into a single 128-bit value in V1
-       VGFMAG CONST_R4R3, V1, V2, V1
-       VGFMAG CONST_R4R3, V1, V3, V1
-       VGFMAG CONST_R4R3, V1, V4, V1
-
-       // Check whether to continue with 64-bit folding
-       CMP R4, $16
-       BLT final_fold
-
-fold_16bytes_loop:
-       VL    0(R3), V2                    // Load next data chunk
-       VPERM V2, V2, CONST_PERM_LE2BE, V2
-
-       VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
-
-       // Adjust buffer pointer and size for folding next data chunk
-       ADD $16, R3
-       ADD $-16, R4
-
-       // Process remaining data chunks
-       CMP R4, $16
-       BGE fold_16bytes_loop
-
-final_fold:
-       VLEIB $7, $0x40, V9
-       VSRLB V9, CONST_R4R3, V0
-       VLEIG $0, $1, V0
-
-       VGFMG V0, V1, V1
-
-       VLEIB  $7, $0x20, V9        // Shift by words
-       VSRLB  V9, V1, V2           // Store remaining bits in V2
-       VUPLLF V1, V1               // Split rightmost doubleword
-       VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2
-
-       // The input values to the Barret reduction are the degree-63 polynomial
-       // in V1 (R(x)), degree-32 generator polynomial, and the reduction
-       // constant u.  The Barret reduction result is the CRC value of R(x) mod
-       // P(x).
-       //
-       // The Barret reduction algorithm is defined as:
-       //
-       //    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
-       //    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
-       //    3. C(x)  = R(x) XOR T2(x) mod x^32
-       //
-       // Note: To compensate for the division by x^32, use the vector unpack
-       // instruction to move the leftmost word into the leftmost doubleword
-       // of the vector register. The rightmost doubleword is multiplied
-       // by zero so that it does not contribute to the intermediate results.
-
-       // T1(x) = floor( R(x) / x^32 ) GF2MUL u
-       VUPLLF V1, V2
-       VGFMG  CONST_RU_POLY, V2, V2
-
-       // Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with
-       // T1(x) in V2 and XOR the intermediate result, T2(x), with the value in V1.
-       // The final result is in the rightmost word of V2.
-
-       VUPLLF V2, V2
-       VGFMAG CONST_CRC_POLY, V2, V1, V2
-
-done:
-       VLGVF $2, V2, R2
-       XOR   $0xffffffff, R2  // NOTW R2
-       MOVWZ R2, ret + 32(FP)
-       RET
-
-crash:
-       MOVD $0, (R0) // input size is less than 64-bytes
index b159d7877d765340e3f39f67f72616c8074297ff..68834ae7fd38a73ab20107ba36ef03ddd5fb00c0 100644 (file)
@@ -12,6 +12,8 @@ gitea.com/macaron/captcha
 gitea.com/macaron/cors
 # gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439
 gitea.com/macaron/csrf
+# gitea.com/macaron/gzip v0.0.0-20191118033930-0c4c5566a0e5
+gitea.com/macaron/gzip
 # gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223
 gitea.com/macaron/i18n
 # gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a
@@ -259,13 +261,9 @@ github.com/keybase/go-crypto/openpgp/errors
 github.com/keybase/go-crypto/openpgp/packet
 github.com/keybase/go-crypto/openpgp/s2k
 github.com/keybase/go-crypto/rsa
-# github.com/klauspost/compress v0.0.0-20161025140425-8df558b6cb6f
+# github.com/klauspost/compress v1.9.2
 github.com/klauspost/compress/flate
 github.com/klauspost/compress/gzip
-# github.com/klauspost/cpuid v0.0.0-20160302075316-09cded8978dc
-github.com/klauspost/cpuid
-# github.com/klauspost/crc32 v0.0.0-20161016154125-cb6bfca970f6
-github.com/klauspost/crc32
 # github.com/kr/pretty v0.1.0
 github.com/kr/pretty
 # github.com/kr/text v0.1.0