diff options
author | Antoine GIRARD <sapk@users.noreply.github.com> | 2018-05-19 14:49:46 +0200 |
---|---|---|
committer | Lunny Xiao <xiaolunwen@gmail.com> | 2018-05-19 20:49:46 +0800 |
commit | 917b9641eca3fa1b1676ba1b4fd77a4e958ee153 (patch) | |
tree | 2caf049dfebccf5ccbc44316630a6c9220062d78 /vendor/github.com/couchbase/vellum/builder.go | |
parent | 1b7cd3d0b0d3652e0660489b9c4da72619400c98 (diff) | |
download | gitea-917b9641eca3fa1b1676ba1b4fd77a4e958ee153.tar.gz gitea-917b9641eca3fa1b1676ba1b4fd77a4e958ee153.zip |
Update to last common bleve (#3986)
Diffstat (limited to 'vendor/github.com/couchbase/vellum/builder.go')
-rw-r--r-- | vendor/github.com/couchbase/vellum/builder.go | 453 |
1 file changed, 453 insertions, 0 deletions
diff --git a/vendor/github.com/couchbase/vellum/builder.go b/vendor/github.com/couchbase/vellum/builder.go
new file mode 100644
index 0000000000..b21db98072
--- /dev/null
+++ b/vendor/github.com/couchbase/vellum/builder.go
@@ -0,0 +1,453 @@
+// Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vellum
+
+import (
+	"bytes"
+	"io"
+)
+
+var defaultBuilderOpts = &BuilderOpts{
+	Encoder:           1,
+	RegistryTableSize: 10000,
+	RegistryMRUSize:   2,
+}
+
+// A Builder is used to build a new FST. Where possible, data is
+// streamed out to the underlying Writer as soon as it is available.
+type Builder struct {
+	unfinished *unfinishedNodes
+	registry   *registry
+	last       []byte
+	len        int
+
+	lastAddr int
+
+	encoder encoder
+	opts    *BuilderOpts
+
+	builderNodePool builderNodePool
+	transitionPool  transitionPool
+}
+
+const noneAddr = 1
+const emptyAddr = 0
+
+// newBuilder returns a new Builder which will stream out the
+// underlying representation to the provided Writer as the set is built.
+func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) {
+	if opts == nil {
+		opts = defaultBuilderOpts
+	}
+	rv := &Builder{
+		registry: newRegistry(opts.RegistryTableSize, opts.RegistryMRUSize),
+		opts:     opts,
+		lastAddr: noneAddr,
+	}
+	rv.unfinished = newUnfinishedNodes(&rv.builderNodePool)
+
+	var err error
+	rv.encoder, err = loadEncoder(opts.Encoder, w)
+	if err != nil {
+		return nil, err
+	}
+	err = rv.encoder.start()
+	if err != nil {
+		return nil, err
+	}
+	return rv, nil
+}
+
+func (b *Builder) Reset(w io.Writer) error {
+	b.transitionPool.reset()
+	b.builderNodePool.reset()
+	b.unfinished.Reset(&b.builderNodePool)
+	b.registry.Reset()
+	b.lastAddr = noneAddr
+	b.encoder.reset(w)
+	b.last = nil
+	b.len = 0
+
+	err := b.encoder.start()
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+// Insert the provided value into the set being built.
+// NOTE: values must be inserted in lexicographical order.
+func (b *Builder) Insert(key []byte, val uint64) error {
+	// ensure items are added in lexicographic order
+	if bytes.Compare(key, b.last) < 0 {
+		return ErrOutOfOrder
+	}
+	if len(key) == 0 {
+		b.len = 1
+		b.unfinished.setRootOutput(val)
+		return nil
+	}
+
+	prefixLen, out := b.unfinished.findCommonPrefixAndSetOutput(key, val)
+	b.len++
+	err := b.compileFrom(prefixLen)
+	if err != nil {
+		return err
+	}
+	b.copyLastKey(key)
+	b.unfinished.addSuffix(key[prefixLen:], out, &b.builderNodePool)
+
+	return nil
+}
+
+func (b *Builder) copyLastKey(key []byte) {
+	if b.last == nil {
+		b.last = make([]byte, 0, 64)
+	} else {
+		b.last = b.last[:0]
+	}
+	b.last = append(b.last, key...)
+}
+
+// Close MUST be called after inserting all values.
+func (b *Builder) Close() error {
+	err := b.compileFrom(0)
+	if err != nil {
+		return err
+	}
+	root := b.unfinished.popRoot()
+	rootAddr, err := b.compile(root)
+	if err != nil {
+		return err
+	}
+	return b.encoder.finish(b.len, rootAddr)
+}
+
+func (b *Builder) compileFrom(iState int) error {
+	addr := noneAddr
+	for iState+1 < len(b.unfinished.stack) {
+		var node *builderNode
+		if addr == noneAddr {
+			node = b.unfinished.popEmpty()
+		} else {
+			node = b.unfinished.popFreeze(addr, &b.transitionPool)
+		}
+		var err error
+		addr, err = b.compile(node)
+		if err != nil {
+			return err
+		}
+	}
+	b.unfinished.topLastFreeze(addr, &b.transitionPool)
+	return nil
+}
+
+func (b *Builder) compile(node *builderNode) (int, error) {
+	if node.final && len(node.trans) == 0 &&
+		node.finalOutput == 0 {
+		return 0, nil
+	}
+	found, addr, entry := b.registry.entry(node)
+	if found {
+		return addr, nil
+	}
+	addr, err := b.encoder.encodeState(node, b.lastAddr)
+	if err != nil {
+		return 0, err
+	}
+
+	b.lastAddr = addr
+	entry.addr = addr
+	return addr, nil
+}
+
+type unfinishedNodes struct {
+	stack []*builderNodeUnfinished
+
+	// cache allocates a reasonable number of builderNodeUnfinished
+	// objects up front and tries to keep reusing them.
+	// Because the main data structure is a stack, we assume the
+	// same access pattern and don't track items separately;
+	// this means calls to get() and pushXYZ() must be paired,
+	// as must calls to put() and popXYZ().
+	cache []builderNodeUnfinished
+}
+
+func (u *unfinishedNodes) Reset(p *builderNodePool) {
+	u.stack = u.stack[:0]
+	for i := 0; i < len(u.cache); i++ {
+		u.cache[i] = builderNodeUnfinished{}
+	}
+	u.pushEmpty(false, p)
+}
+
+func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes {
+	rv := &unfinishedNodes{
+		stack: make([]*builderNodeUnfinished, 0, 64),
+		cache: make([]builderNodeUnfinished, 64),
+	}
+	rv.pushEmpty(false, p)
+	return rv
+}
+
+// get a new builderNodeUnfinished, reusing the cache if possible
+func (u *unfinishedNodes) get() *builderNodeUnfinished {
+	if len(u.stack) < len(u.cache) {
+		return &u.cache[len(u.stack)]
+	}
+	// cache is full; allocate a new one
+	return &builderNodeUnfinished{}
+}
+
+// return a builderNodeUnfinished, clearing it for reuse
+func (u *unfinishedNodes) put() {
+	if len(u.stack) >= len(u.cache) {
+		// do nothing, not part of cache
+		return
+	}
+	u.cache[len(u.stack)] = builderNodeUnfinished{}
+}
+
+func (u *unfinishedNodes) findCommonPrefixAndSetOutput(key []byte,
+	out uint64) (int, uint64) {
+	var i int
+	for i < len(key) {
+		if i >= len(u.stack) {
+			break
+		}
+		var addPrefix uint64
+		if !u.stack[i].hasLastT {
+			break
+		}
+		if u.stack[i].lastIn == key[i] {
+			commonPre := outputPrefix(u.stack[i].lastOut, out)
+			addPrefix = outputSub(u.stack[i].lastOut, commonPre)
+			out = outputSub(out, commonPre)
+			u.stack[i].lastOut = commonPre
+			i++
+		} else {
+			break
+		}
+
+		if addPrefix != 0 {
+			u.stack[i].addOutputPrefix(addPrefix)
+		}
+	}
+
+	return i, out
+}
+
+func (u *unfinishedNodes) pushEmpty(final bool, p *builderNodePool) {
+	next := u.get()
+	next.node = p.alloc()
+	next.node.final = final
+	u.stack = append(u.stack, next)
+}
+
+func (u *unfinishedNodes) popRoot() *builderNode {
+	l := len(u.stack)
+	var unfinished *builderNodeUnfinished
+	u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
+	rv := unfinished.node
+	u.put()
+	return rv
+}
+
+func (u *unfinishedNodes) popFreeze(addr int, tp *transitionPool) *builderNode {
+	l := len(u.stack)
+	var unfinished *builderNodeUnfinished
+	u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
+	unfinished.lastCompiled(addr, tp)
+	rv := unfinished.node
+	u.put()
+	return rv
+}
+
+func (u *unfinishedNodes) popEmpty() *builderNode {
+	l := len(u.stack)
+	var unfinished *builderNodeUnfinished
+	u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
+	rv := unfinished.node
+	u.put()
+	return rv
+}
+
+func (u *unfinishedNodes) setRootOutput(out uint64) {
+	u.stack[0].node.final = true
+	u.stack[0].node.finalOutput = out
+}
+
+func (u *unfinishedNodes) topLastFreeze(addr int, tp *transitionPool) {
+	last := len(u.stack) - 1
+	u.stack[last].lastCompiled(addr, tp)
+}
+
+func (u *unfinishedNodes) addSuffix(bs []byte, out uint64, p *builderNodePool) {
+	if len(bs) == 0 {
+		return
+	}
+	last := len(u.stack) - 1
+	u.stack[last].hasLastT = true
+	u.stack[last].lastIn = bs[0]
+	u.stack[last].lastOut = out
+	for _, b := range bs[1:] {
+		next := u.get()
+		next.node = p.alloc()
+		next.hasLastT = true
+		next.lastIn = b
+		next.lastOut = 0
+		u.stack = append(u.stack, next)
+	}
+	u.pushEmpty(true, p)
+}
+
+type builderNodeUnfinished struct {
+	node     *builderNode
+	lastOut  uint64
+	lastIn   byte
+	hasLastT bool
+}
+
+func (b *builderNodeUnfinished) lastCompiled(addr int, tp *transitionPool) {
+	if b.hasLastT {
+		transIn := b.lastIn
+		transOut := b.lastOut
+		b.hasLastT = false
+		b.lastOut = 0
+		trans := tp.alloc()
+		trans.in = transIn
+		trans.out = transOut
+		trans.addr = addr
+		b.node.trans = append(b.node.trans, trans)
+	}
+}
+
+func (b *builderNodeUnfinished) addOutputPrefix(prefix uint64) {
+	if b.node.final {
+		b.node.finalOutput = outputCat(prefix, b.node.finalOutput)
+	}
+	for _, t := range b.node.trans {
+		t.out = outputCat(prefix, t.out)
+	}
+	if b.hasLastT {
+		b.lastOut = outputCat(prefix, b.lastOut)
+	}
+}
+
+type builderNode struct {
+	finalOutput uint64
+	trans       []*transition
+	final       bool
+}
+
+func (n *builderNode) equiv(o *builderNode) bool {
+	if n.final != o.final {
+		return false
+	}
+	if n.finalOutput != o.finalOutput {
+		return false
+	}
+	if len(n.trans) != len(o.trans) {
+		return false
+	}
+	for i, ntrans := range n.trans {
+		otrans := o.trans[i]
+		if ntrans.in != otrans.in {
+			return false
+		}
+		if ntrans.addr != otrans.addr {
+			return false
+		}
+		if ntrans.out != otrans.out {
+			return false
+		}
+	}
+	return true
+}
+
+type transition struct {
+	out  uint64
+	addr int
+	in   byte
+}
+
+func outputPrefix(l, r uint64) uint64 {
+	if l < r {
+		return l
+	}
+	return r
+}
+
+func outputSub(l, r uint64) uint64 {
+	return l - r
+}
+
+func outputCat(l, r uint64) uint64 {
+	return l + r
+}
+
+// the next builderNode to alloc() will be all[nextOuter][nextInner]
+type builderNodePool struct {
+	all       [][]builderNode
+	nextOuter int
+	nextInner int
+}
+
+func (p *builderNodePool) reset() {
+	p.nextOuter = 0
+	p.nextInner = 0
+}
+
+func (p *builderNodePool) alloc() *builderNode {
+	if p.nextOuter >= len(p.all) {
+		p.all = append(p.all, make([]builderNode, 256))
+	}
+	rv := &p.all[p.nextOuter][p.nextInner]
+	p.nextInner += 1
+	if p.nextInner >= len(p.all[p.nextOuter]) {
+		p.nextOuter += 1
+		p.nextInner = 0
+	}
+	rv.finalOutput = 0
+	rv.trans = rv.trans[:0]
+	rv.final = false
+	return rv
+}
+
+// the next transition to alloc() will be all[nextOuter][nextInner]
+type transitionPool struct {
+	all       [][]transition
+	nextOuter int
+	nextInner int
+}
+
+func (p *transitionPool) reset() {
+	p.nextOuter = 0
+	p.nextInner = 0
+}
+
+func (p *transitionPool) alloc() *transition {
+	if p.nextOuter >= len(p.all) {
+		p.all = append(p.all, make([]transition, 256))
+	}
+	rv := &p.all[p.nextOuter][p.nextInner]
+	p.nextInner += 1
+	if p.nextInner >= len(p.all[p.nextOuter]) {
+		p.nextOuter += 1
+		p.nextInner = 0
+	}
+	*rv = transition{}
+	return rv
+}
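For orientation, here is a minimal sketch of how this vendored builder is driven in practice. It assumes vellum's exported `vellum.New` constructor, which wraps the unexported `newBuilder` above; passing `nil` options selects `defaultBuilderOpts`. Keys must arrive in lexicographic order, otherwise `Insert` returns `ErrOutOfOrder`.

```go
package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/couchbase/vellum"
)

func main() {
	var buf bytes.Buffer

	// nil opts selects the library defaults (defaultBuilderOpts above).
	b, err := vellum.New(&buf, nil)
	if err != nil {
		log.Fatal(err)
	}

	// Keys MUST be inserted in lexicographic order; a key that sorts
	// before the previous one makes Insert return ErrOutOfOrder.
	for _, kv := range []struct {
		k string
		v uint64
	}{{"cat", 5}, {"cats", 8}, {"dog", 2}} {
		if err := b.Insert([]byte(kv.k), kv.v); err != nil {
			log.Fatal(err)
		}
	}

	// Close compiles all remaining unfinished nodes and writes the
	// FST footer via encoder.finish.
	if err := b.Close(); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("built FST of %d bytes\n", buf.Len())
}
```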
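The heart of `Insert` is the interplay between `findCommonPrefixAndSetOutput` and `compileFrom`: only the states along the prefix shared with the previous key stay on the `unfinished` stack, and everything deeper is frozen and handed to the registry. Ignoring the output bookkeeping that the real method also performs, the surviving depth is simply the byte-wise common prefix length of consecutive keys. The helper below is illustrative only, not part of vellum.

```go
package main

import "fmt"

// commonPrefixLen plays the role of the index i returned by
// findCommonPrefixAndSetOutput: the first i states of the stack are
// shared with the previous key and stay unfinished, while
// compileFrom(i) freezes everything deeper.
func commonPrefixLen(a, b []byte) int {
	i := 0
	for i < len(a) && i < len(b) && a[i] == b[i] {
		i++
	}
	return i
}

func main() {
	// "dog" then "dot": the prefix "do" survives; the state reached
	// by 'g' is compiled before the new suffix "t" is added.
	fmt.Println(commonPrefixLen([]byte("dog"), []byte("dot"))) // 2

	// "dog" then "dogs": the whole previous key is a prefix, so
	// nothing needs to be compiled yet.
	fmt.Println(commonPrefixLen([]byte("dog"), []byte("dogs"))) // 3
}
```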
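The three `output*` helpers form a tiny algebra over `uint64` values: `outputPrefix` is min, `outputSub` is subtraction, and `outputCat` is addition. During insertion, the common part of two outputs is pushed toward the root and the remainder travels down the new suffix, so summing edge outputs along any accepted key reproduces its value. A self-contained worked example, using local copies of the helpers defined above:

```go
package main

import "fmt"

// Local copies of the output algebra from builder.go:
// prefix = min, sub = -, cat = +.
func outputPrefix(l, r uint64) uint64 {
	if l < r {
		return l
	}
	return r
}
func outputSub(l, r uint64) uint64 { return l - r }
func outputCat(l, r uint64) uint64 { return l + r }

func main() {
	// Suppose "cat" -> 5 is on the stack and "cats" -> 8 arrives.
	lastOut, out := uint64(5), uint64(8)
	commonPre := outputPrefix(lastOut, out) // 5 stays on the shared edge
	suffix := outputSub(out, commonPre)     // 3 moves to the new suffix
	// Reading c-a-t-s sums the edges back to the original value 8.
	fmt.Println(commonPre, suffix, outputCat(commonPre, suffix)) // 5 3 8
}
```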
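Finally, `builderNodePool` and `transitionPool` both implement the same slab-allocation pattern: grow in 256-element chunks, hand out pointers sequentially, and let `reset` rewind two indices so that `Builder.Reset` reuses every previously allocated object without freeing anything. A generic restatement of the pattern, purely illustrative (vellum predates Go generics):

```go
// slabPool hands out *T values from 256-element slabs; reset rewinds
// the cursor so all slabs are reused on the next build.
type slabPool[T any] struct {
	all                  [][]T
	nextOuter, nextInner int
}

func (p *slabPool[T]) alloc() *T {
	if p.nextOuter >= len(p.all) {
		p.all = append(p.all, make([]T, 256))
	}
	rv := &p.all[p.nextOuter][p.nextInner]
	p.nextInner++
	if p.nextInner >= len(p.all[p.nextOuter]) {
		p.nextOuter++
		p.nextInner = 0
	}
	return rv
}

// reset rewinds the indices without releasing the slabs, which is
// what makes Builder.Reset an allocation-free operation.
func (p *slabPool[T]) reset() { p.nextOuter, p.nextInner = 0, 0 }
```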