diff options
Diffstat (limited to 'vendor/github.com/blevesearch/vellum/builder.go')
-rw-r--r-- | vendor/github.com/blevesearch/vellum/builder.go | 452 |
1 files changed, 452 insertions, 0 deletions
diff --git a/vendor/github.com/blevesearch/vellum/builder.go b/vendor/github.com/blevesearch/vellum/builder.go new file mode 100644 index 0000000000..f793329575 --- /dev/null +++ b/vendor/github.com/blevesearch/vellum/builder.go @@ -0,0 +1,452 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vellum + +import ( + "bytes" + "io" +) + +var defaultBuilderOpts = &BuilderOpts{ + Encoder: 1, + RegistryTableSize: 10000, + RegistryMRUSize: 2, +} + +// A Builder is used to build a new FST. When possible data is +// streamed out to the underlying Writer as soon as possible. +type Builder struct { + unfinished *unfinishedNodes + registry *registry + last []byte + len int + + lastAddr int + + encoder encoder + opts *BuilderOpts + + builderNodePool *builderNodePool +} + +const noneAddr = 1 +const emptyAddr = 0 + +// NewBuilder returns a new Builder which will stream out the +// underlying representation to the provided Writer as the set is built. +func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) { + if opts == nil { + opts = defaultBuilderOpts + } + builderNodePool := &builderNodePool{} + rv := &Builder{ + unfinished: newUnfinishedNodes(builderNodePool), + registry: newRegistry(builderNodePool, opts.RegistryTableSize, opts.RegistryMRUSize), + builderNodePool: builderNodePool, + opts: opts, + lastAddr: noneAddr, + } + + var err error + rv.encoder, err = loadEncoder(opts.Encoder, w) + if err != nil { + return nil, err + } + err = rv.encoder.start() + if err != nil { + return nil, err + } + return rv, nil +} + +func (b *Builder) Reset(w io.Writer) error { + b.unfinished.Reset() + b.registry.Reset() + b.lastAddr = noneAddr + b.encoder.reset(w) + b.last = nil + b.len = 0 + + err := b.encoder.start() + if err != nil { + return err + } + return nil +} + +// Insert the provided value to the set being built. +// NOTE: values must be inserted in lexicographical order. +func (b *Builder) Insert(key []byte, val uint64) error { + // ensure items are added in lexicographic order + if bytes.Compare(key, b.last) < 0 { + return ErrOutOfOrder + } + if len(key) == 0 { + b.len = 1 + b.unfinished.setRootOutput(val) + return nil + } + + prefixLen, out := b.unfinished.findCommonPrefixAndSetOutput(key, val) + b.len++ + err := b.compileFrom(prefixLen) + if err != nil { + return err + } + b.copyLastKey(key) + b.unfinished.addSuffix(key[prefixLen:], out) + + return nil +} + +func (b *Builder) copyLastKey(key []byte) { + if b.last == nil { + b.last = make([]byte, 0, 64) + } else { + b.last = b.last[:0] + } + b.last = append(b.last, key...) +} + +// Close MUST be called after inserting all values. +func (b *Builder) Close() error { + err := b.compileFrom(0) + if err != nil { + return err + } + root := b.unfinished.popRoot() + rootAddr, err := b.compile(root) + if err != nil { + return err + } + return b.encoder.finish(b.len, rootAddr) +} + +func (b *Builder) compileFrom(iState int) error { + addr := noneAddr + for iState+1 < len(b.unfinished.stack) { + var node *builderNode + if addr == noneAddr { + node = b.unfinished.popEmpty() + } else { + node = b.unfinished.popFreeze(addr) + } + var err error + addr, err = b.compile(node) + if err != nil { + return nil + } + } + b.unfinished.topLastFreeze(addr) + return nil +} + +func (b *Builder) compile(node *builderNode) (int, error) { + if node.final && len(node.trans) == 0 && + node.finalOutput == 0 { + return 0, nil + } + found, addr, entry := b.registry.entry(node) + if found { + return addr, nil + } + addr, err := b.encoder.encodeState(node, b.lastAddr) + if err != nil { + return 0, err + } + + b.lastAddr = addr + entry.addr = addr + return addr, nil +} + +type unfinishedNodes struct { + stack []*builderNodeUnfinished + + // cache allocates a reasonable number of builderNodeUnfinished + // objects up front and tries to keep reusing them + // because the main data structure is a stack, we assume the + // same access pattern, and don't track items separately + // this means calls get() and pushXYZ() must be paired, + // as well as calls put() and popXYZ() + cache []builderNodeUnfinished + + builderNodePool *builderNodePool +} + +func (u *unfinishedNodes) Reset() { + u.stack = u.stack[:0] + for i := 0; i < len(u.cache); i++ { + u.cache[i] = builderNodeUnfinished{} + } + u.pushEmpty(false) +} + +func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes { + rv := &unfinishedNodes{ + stack: make([]*builderNodeUnfinished, 0, 64), + cache: make([]builderNodeUnfinished, 64), + builderNodePool: p, + } + rv.pushEmpty(false) + return rv +} + +// get new builderNodeUnfinished, reusing cache if possible +func (u *unfinishedNodes) get() *builderNodeUnfinished { + if len(u.stack) < len(u.cache) { + return &u.cache[len(u.stack)] + } + // full now allocate a new one + return &builderNodeUnfinished{} +} + +// return builderNodeUnfinished, clearing it for reuse +func (u *unfinishedNodes) put() { + if len(u.stack) >= len(u.cache) { + return + // do nothing, not part of cache + } + u.cache[len(u.stack)] = builderNodeUnfinished{} +} + +func (u *unfinishedNodes) findCommonPrefixAndSetOutput(key []byte, + out uint64) (int, uint64) { + var i int + for i < len(key) { + if i >= len(u.stack) { + break + } + var addPrefix uint64 + if !u.stack[i].hasLastT { + break + } + if u.stack[i].lastIn == key[i] { + commonPre := outputPrefix(u.stack[i].lastOut, out) + addPrefix = outputSub(u.stack[i].lastOut, commonPre) + out = outputSub(out, commonPre) + u.stack[i].lastOut = commonPre + i++ + } else { + break + } + + if addPrefix != 0 { + u.stack[i].addOutputPrefix(addPrefix) + } + } + + return i, out +} + +func (u *unfinishedNodes) pushEmpty(final bool) { + next := u.get() + next.node = u.builderNodePool.Get() + next.node.final = final + u.stack = append(u.stack, next) +} + +func (u *unfinishedNodes) popRoot() *builderNode { + l := len(u.stack) + var unfinished *builderNodeUnfinished + u.stack, unfinished = u.stack[:l-1], u.stack[l-1] + rv := unfinished.node + u.put() + return rv +} + +func (u *unfinishedNodes) popFreeze(addr int) *builderNode { + l := len(u.stack) + var unfinished *builderNodeUnfinished + u.stack, unfinished = u.stack[:l-1], u.stack[l-1] + unfinished.lastCompiled(addr) + rv := unfinished.node + u.put() + return rv +} + +func (u *unfinishedNodes) popEmpty() *builderNode { + l := len(u.stack) + var unfinished *builderNodeUnfinished + u.stack, unfinished = u.stack[:l-1], u.stack[l-1] + rv := unfinished.node + u.put() + return rv +} + +func (u *unfinishedNodes) setRootOutput(out uint64) { + u.stack[0].node.final = true + u.stack[0].node.finalOutput = out +} + +func (u *unfinishedNodes) topLastFreeze(addr int) { + last := len(u.stack) - 1 + u.stack[last].lastCompiled(addr) +} + +func (u *unfinishedNodes) addSuffix(bs []byte, out uint64) { + if len(bs) == 0 { + return + } + last := len(u.stack) - 1 + u.stack[last].hasLastT = true + u.stack[last].lastIn = bs[0] + u.stack[last].lastOut = out + for _, b := range bs[1:] { + next := u.get() + next.node = u.builderNodePool.Get() + next.hasLastT = true + next.lastIn = b + next.lastOut = 0 + u.stack = append(u.stack, next) + } + u.pushEmpty(true) +} + +type builderNodeUnfinished struct { + node *builderNode + lastOut uint64 + lastIn byte + hasLastT bool +} + +func (b *builderNodeUnfinished) lastCompiled(addr int) { + if b.hasLastT { + transIn := b.lastIn + transOut := b.lastOut + b.hasLastT = false + b.lastOut = 0 + b.node.trans = append(b.node.trans, transition{ + in: transIn, + out: transOut, + addr: addr, + }) + } +} + +func (b *builderNodeUnfinished) addOutputPrefix(prefix uint64) { + if b.node.final { + b.node.finalOutput = outputCat(prefix, b.node.finalOutput) + } + for i := range b.node.trans { + b.node.trans[i].out = outputCat(prefix, b.node.trans[i].out) + } + if b.hasLastT { + b.lastOut = outputCat(prefix, b.lastOut) + } +} + +type builderNode struct { + finalOutput uint64 + trans []transition + final bool + + // intrusive linked list + next *builderNode +} + +// reset resets the receiver builderNode to a re-usable state. +func (n *builderNode) reset() { + n.final = false + n.finalOutput = 0 + for i := range n.trans { + n.trans[i] = emptyTransition + } + n.trans = n.trans[:0] + n.next = nil +} + +func (n *builderNode) equiv(o *builderNode) bool { + if n.final != o.final { + return false + } + if n.finalOutput != o.finalOutput { + return false + } + if len(n.trans) != len(o.trans) { + return false + } + for i, ntrans := range n.trans { + otrans := o.trans[i] + if ntrans.in != otrans.in { + return false + } + if ntrans.addr != otrans.addr { + return false + } + if ntrans.out != otrans.out { + return false + } + } + return true +} + +var emptyTransition = transition{} + +type transition struct { + out uint64 + addr int + in byte +} + +func outputPrefix(l, r uint64) uint64 { + if l < r { + return l + } + return r +} + +func outputSub(l, r uint64) uint64 { + return l - r +} + +func outputCat(l, r uint64) uint64 { + return l + r +} + +// builderNodePool pools builderNodes using a singly linked list. +// +// NB: builderNode lifecylce is described by the following interactions - +// +------------------------+ +----------------------+ +// | Unfinished Nodes | Transfer once | Registry | +// |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) | +// +------------------------+ marked frozen +----------------------+ +// ^ | +// | | +// | Put() +// | Get() on +-------------------+ when +// +-new char--------| builderNode Pool |<-----------evicted +// +-------------------+ +type builderNodePool struct { + head *builderNode +} + +func (p *builderNodePool) Get() *builderNode { + if p.head == nil { + return &builderNode{} + } + head := p.head + p.head = p.head.next + return head +} + +func (p *builderNodePool) Put(v *builderNode) { + if v == nil { + return + } + v.reset() + v.next = p.head + p.head = v +} |