diff options
Diffstat (limited to 'vendor/github.com/Smerity')
-rw-r--r-- | vendor/github.com/Smerity/govarint/LICENSE | 22 | ||||
-rw-r--r-- | vendor/github.com/Smerity/govarint/README.md | 67 | ||||
-rw-r--r-- | vendor/github.com/Smerity/govarint/govarint.go | 229 |
3 files changed, 318 insertions, 0 deletions
diff --git a/vendor/github.com/Smerity/govarint/LICENSE b/vendor/github.com/Smerity/govarint/LICENSE new file mode 100644 index 0000000000..be09cac865 --- /dev/null +++ b/vendor/github.com/Smerity/govarint/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 Stephen Merity + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/vendor/github.com/Smerity/govarint/README.md b/vendor/github.com/Smerity/govarint/README.md new file mode 100644 index 0000000000..5b82dccb2f --- /dev/null +++ b/vendor/github.com/Smerity/govarint/README.md @@ -0,0 +1,67 @@ +# Govarint + +This project aims to provide a simple API for the performant encoding and decoding of 32 and 64 bit integers using a variety of algorithms. + +[![](http://i.imgur.com/mpgC23U.jpg)](https://www.flickr.com/photos/tsevis/8648521649/) + +## Usage + +Each integer encoding algorithm conforms to an encoding and decoding interface. +The interfaces also specify the size of the unsigned integer, either 32 or 64 bits, and will be referred to as XX below. +To create an encoder: + + NewU32Base128Encoder(w io.Writer) + NewU64Base128Encoder(w io.Writer) + NewU32GroupVarintEncoder(w io.Writer) + +For encoders, the only two commands are `PutUXX` and `Close`. +`Close` must be called as some integer encoding algorithms write in multiples. + + var buf bytes.Buffer + enc := NewU32Base128Encoder(&buf) + enc.PutU32(117) + enc.PutU32(343) + enc.Close() + +To create a decoder: + + NewU32Base128Decoder(r io.ByteReader) + NewU64Base128Decoder(r io.ByteReader) + NewU32GroupVarintDecoder(r io.ByteReader) + +For decoders, the only command is `GetUXX`. +`GetUXX` returns the value and any potential errors. +When reading is complete, `GetUXX` will return an `EOF` (End Of File). + + dec := NewU32Base128Decoder(&buf) + x, err := dec.GetU32() + +## Use Cases + +Using fixed width integers, such as uint32 and uint64, usually waste large amounts of space, especially when encoding small values. +Optimally, smaller numbers should take less space to represent. + +Using integer encoding algorithms is especially common in specific applications, such as storing edge lists or indexes for search engines. +In these situations, you have a sorted list of numbers that you want to keep as compactly as possible in memory. +Additionally, by storing only the difference between the given number and the previous (delta encoding), the numbers are quite small, and thus compress well. + +For an explicit example, the Web Data Commons Hyperlink Graph contains 128 billion edges linking page A to page B, where each page is represented by a 32 bit integer. +By converting all these edges to 64 bit integers (32 | 32), sorting them, and then using delta encoding, memory usage can be reduced from 64 bits per edge down to only 9 bits per edge using the Base128 integer encoding algorithm. +This figure improves even further if compressed using conventional compression algorithms (3 bits per edge). + +## Encodings supported + +`govarint` supports: + ++ Base128 [32, 64] - each byte uses 7 bits for encoding the integer and 1 bit for indicating if the integer requires another byte ++ Group Varint [32] - integers are encoded in blocks of four - one byte encodes the size of the following four integers, then the values of the four integers follows + +Group Varint consistently beats Base128 in decompression speed but Base128 may offer improved compression ratios depending on the distribution of the supplied integers. + +## Tests + + go test -v -bench=. + +## License + +MIT License, as per `LICENSE` diff --git a/vendor/github.com/Smerity/govarint/govarint.go b/vendor/github.com/Smerity/govarint/govarint.go new file mode 100644 index 0000000000..61328a337b --- /dev/null +++ b/vendor/github.com/Smerity/govarint/govarint.go @@ -0,0 +1,229 @@ +package govarint + +import "encoding/binary" +import "io" + +type U32VarintEncoder interface { + PutU32(x uint32) int + Close() +} + +type U32VarintDecoder interface { + GetU32() (uint32, error) +} + +/// + +type U64VarintEncoder interface { + PutU64(x uint64) int + Close() +} + +type U64VarintDecoder interface { + GetU64() (uint64, error) +} + +/// + +type U32GroupVarintEncoder struct { + w io.Writer + index int + store [4]uint32 + temp [17]byte +} + +func NewU32GroupVarintEncoder(w io.Writer) *U32GroupVarintEncoder { return &U32GroupVarintEncoder{w: w} } + +func (b *U32GroupVarintEncoder) Flush() (int, error) { + // TODO: Is it more efficient to have a tailored version that's called only in Close()? + // If index is zero, there are no integers to flush + if b.index == 0 { + return 0, nil + } + // In the case we're flushing (the group isn't of size four), the non-values should be zero + // This ensures the unused entries are all zero in the sizeByte + for i := b.index; i < 4; i++ { + b.store[i] = 0 + } + length := 1 + // We need to reset the size byte to zero as we only bitwise OR into it, we don't overwrite it + b.temp[0] = 0 + for i, x := range b.store { + size := byte(0) + shifts := []byte{24, 16, 8, 0} + for _, shift := range shifts { + // Always writes at least one byte -- the first one (shift = 0) + // Will write more bytes until the rest of the integer is all zeroes + if (x>>shift) != 0 || shift == 0 { + size += 1 + b.temp[length] = byte(x >> shift) + length += 1 + } + } + // We store the size in two of the eight bits in the first byte (sizeByte) + // 0 means there is one byte in total, hence why we subtract one from size + b.temp[0] |= (size - 1) << (uint8(3-i) * 2) + } + // If we're flushing without a full group of four, remove the unused bytes we computed + // This enables us to realize it's a partial group on decoding thanks to EOF + if b.index != 4 { + length -= 4 - b.index + } + _, err := b.w.Write(b.temp[:length]) + return length, err +} + +func (b *U32GroupVarintEncoder) PutU32(x uint32) (int, error) { + bytesWritten := 0 + b.store[b.index] = x + b.index += 1 + if b.index == 4 { + n, err := b.Flush() + if err != nil { + return n, err + } + bytesWritten += n + b.index = 0 + } + return bytesWritten, nil +} + +func (b *U32GroupVarintEncoder) Close() { + // On Close, we flush any remaining values that might not have been in a full group + b.Flush() +} + +/// + +type U32GroupVarintDecoder struct { + r io.ByteReader + group [4]uint32 + pos int + finished bool + capacity int +} + +func NewU32GroupVarintDecoder(r io.ByteReader) *U32GroupVarintDecoder { + return &U32GroupVarintDecoder{r: r, pos: 4, capacity: 4} +} + +func (b *U32GroupVarintDecoder) getGroup() error { + // We should always receive a sizeByte if there are more values to read + sizeByte, err := b.r.ReadByte() + if err != nil { + return err + } + // Calculate the size of the four incoming 32 bit integers + // 0b00 means 1 byte to read, 0b01 = 2, etc + b.group[0] = uint32((sizeByte >> 6) & 3) + b.group[1] = uint32((sizeByte >> 4) & 3) + b.group[2] = uint32((sizeByte >> 2) & 3) + b.group[3] = uint32(sizeByte & 3) + // + for index, size := range b.group { + b.group[index] = 0 + // Any error that occurs in earlier byte reads should be repeated at the end one + // Hence we only catch and report the final ReadByte's error + var err error + switch size { + case 0: + var x byte + x, err = b.r.ReadByte() + b.group[index] = uint32(x) + case 1: + var x, y byte + x, _ = b.r.ReadByte() + y, err = b.r.ReadByte() + b.group[index] = uint32(x)<<8 | uint32(y) + case 2: + var x, y, z byte + x, _ = b.r.ReadByte() + y, _ = b.r.ReadByte() + z, err = b.r.ReadByte() + b.group[index] = uint32(x)<<16 | uint32(y)<<8 | uint32(z) + case 3: + var x, y, z, zz byte + x, _ = b.r.ReadByte() + y, _ = b.r.ReadByte() + z, _ = b.r.ReadByte() + zz, err = b.r.ReadByte() + b.group[index] = uint32(x)<<24 | uint32(y)<<16 | uint32(z)<<8 | uint32(zz) + } + if err != nil { + if err == io.EOF { + // If we hit EOF here, we have found a partial group + // We've return any valid entries we have read and return EOF once we run out + b.capacity = index + b.finished = true + break + } else { + return err + } + } + } + // Reset the pos pointer to the beginning of the read values + b.pos = 0 + return nil +} + +func (b *U32GroupVarintDecoder) GetU32() (uint32, error) { + // Check if we have any more values to give out - if not, let's get them + if b.pos == b.capacity { + // If finished is set, there is nothing else to do + if b.finished { + return 0, io.EOF + } + err := b.getGroup() + if err != nil { + return 0, err + } + } + // Increment pointer and return the value stored at that point + b.pos += 1 + return b.group[b.pos-1], nil +} + +/// + +type Base128Encoder struct { + w io.Writer + tmpBytes []byte +} + +func NewU32Base128Encoder(w io.Writer) *Base128Encoder { + return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen32)} +} +func NewU64Base128Encoder(w io.Writer) *Base128Encoder { + return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)} +} + +func (b *Base128Encoder) PutU32(x uint32) (int, error) { + writtenBytes := binary.PutUvarint(b.tmpBytes, uint64(x)) + return b.w.Write(b.tmpBytes[:writtenBytes]) +} + +func (b *Base128Encoder) PutU64(x uint64) (int, error) { + writtenBytes := binary.PutUvarint(b.tmpBytes, x) + return b.w.Write(b.tmpBytes[:writtenBytes]) +} + +func (b *Base128Encoder) Close() { +} + +/// + +type Base128Decoder struct { + r io.ByteReader +} + +func NewU32Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} } +func NewU64Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} } + +func (b *Base128Decoder) GetU32() (uint32, error) { + v, err := binary.ReadUvarint(b.r) + return uint32(v), err +} + +func (b *Base128Decoder) GetU64() (uint64, error) { + return binary.ReadUvarint(b.r) +} |