diff options
author | Lunny Xiao <xiaolunwen@gmail.com> | 2019-02-18 08:50:26 +0800 |
---|---|---|
committer | techknowlogick <matti@mdranta.net> | 2019-02-17 19:50:26 -0500 |
commit | a380cfd8e03148a05859a7496d235fa14bde4796 (patch) | |
tree | 9ef2f4b66804e73e242d0d07fd30769898a0ca23 /vendor/github.com/couchbase | |
parent | 11e316654e523bd668a20e1e6a95da3f5b9b22de (diff) | |
download | gitea-a380cfd8e03148a05859a7496d235fa14bde4796.tar.gz gitea-a380cfd8e03148a05859a7496d235fa14bde4796.zip |
Update bleve dependency to latest master revision (#6100)
* update bleve to master b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2
* remove unused pkg from dep file
* change bleve from master to recent revision
Diffstat (limited to 'vendor/github.com/couchbase')
17 files changed, 1637 insertions, 255 deletions
diff --git a/vendor/github.com/couchbase/vellum/automaton.go b/vendor/github.com/couchbase/vellum/automaton.go index 47526595bc..70398f2d47 100644 --- a/vendor/github.com/couchbase/vellum/automaton.go +++ b/vendor/github.com/couchbase/vellum/automaton.go @@ -81,5 +81,5 @@ func (m *AlwaysMatch) Accept(int, byte) int { return 0 } -// creating an alwaysMatchAutomaton to avoid unnecesary repeated allocations. +// creating an alwaysMatchAutomaton to avoid unnecessary repeated allocations. var alwaysMatchAutomaton = &AlwaysMatch{} diff --git a/vendor/github.com/couchbase/vellum/builder.go b/vendor/github.com/couchbase/vellum/builder.go index b21db98072..f793329575 100644 --- a/vendor/github.com/couchbase/vellum/builder.go +++ b/vendor/github.com/couchbase/vellum/builder.go @@ -38,8 +38,7 @@ type Builder struct { encoder encoder opts *BuilderOpts - builderNodePool builderNodePool - transitionPool transitionPool + builderNodePool *builderNodePool } const noneAddr = 1 @@ -51,12 +50,14 @@ func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) { if opts == nil { opts = defaultBuilderOpts } + builderNodePool := &builderNodePool{} rv := &Builder{ - registry: newRegistry(opts.RegistryTableSize, opts.RegistryMRUSize), - opts: opts, - lastAddr: noneAddr, + unfinished: newUnfinishedNodes(builderNodePool), + registry: newRegistry(builderNodePool, opts.RegistryTableSize, opts.RegistryMRUSize), + builderNodePool: builderNodePool, + opts: opts, + lastAddr: noneAddr, } - rv.unfinished = newUnfinishedNodes(&rv.builderNodePool) var err error rv.encoder, err = loadEncoder(opts.Encoder, w) @@ -71,9 +72,7 @@ func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) { } func (b *Builder) Reset(w io.Writer) error { - b.transitionPool.reset() - b.builderNodePool.reset() - b.unfinished.Reset(&b.builderNodePool) + b.unfinished.Reset() b.registry.Reset() b.lastAddr = noneAddr b.encoder.reset(w) @@ -107,7 +106,7 @@ func (b *Builder) Insert(key []byte, val uint64) error { return err } b.copyLastKey(key) - b.unfinished.addSuffix(key[prefixLen:], out, &b.builderNodePool) + b.unfinished.addSuffix(key[prefixLen:], out) return nil } @@ -142,7 +141,7 @@ func (b *Builder) compileFrom(iState int) error { if addr == noneAddr { node = b.unfinished.popEmpty() } else { - node = b.unfinished.popFreeze(addr, &b.transitionPool) + node = b.unfinished.popFreeze(addr) } var err error addr, err = b.compile(node) @@ -150,7 +149,7 @@ func (b *Builder) compileFrom(iState int) error { return nil } } - b.unfinished.topLastFreeze(addr, &b.transitionPool) + b.unfinished.topLastFreeze(addr) return nil } @@ -183,22 +182,25 @@ type unfinishedNodes struct { // this means calls get() and pushXYZ() must be paired, // as well as calls put() and popXYZ() cache []builderNodeUnfinished + + builderNodePool *builderNodePool } -func (u *unfinishedNodes) Reset(p *builderNodePool) { +func (u *unfinishedNodes) Reset() { u.stack = u.stack[:0] for i := 0; i < len(u.cache); i++ { u.cache[i] = builderNodeUnfinished{} } - u.pushEmpty(false, p) + u.pushEmpty(false) } func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes { rv := &unfinishedNodes{ - stack: make([]*builderNodeUnfinished, 0, 64), - cache: make([]builderNodeUnfinished, 64), + stack: make([]*builderNodeUnfinished, 0, 64), + cache: make([]builderNodeUnfinished, 64), + builderNodePool: p, } - rv.pushEmpty(false, p) + rv.pushEmpty(false) return rv } @@ -249,9 +251,9 @@ func (u *unfinishedNodes) findCommonPrefixAndSetOutput(key []byte, return i, out } -func (u *unfinishedNodes) pushEmpty(final bool, p *builderNodePool) { +func (u *unfinishedNodes) pushEmpty(final bool) { next := u.get() - next.node = p.alloc() + next.node = u.builderNodePool.Get() next.node.final = final u.stack = append(u.stack, next) } @@ -265,11 +267,11 @@ func (u *unfinishedNodes) popRoot() *builderNode { return rv } -func (u *unfinishedNodes) popFreeze(addr int, tp *transitionPool) *builderNode { +func (u *unfinishedNodes) popFreeze(addr int) *builderNode { l := len(u.stack) var unfinished *builderNodeUnfinished u.stack, unfinished = u.stack[:l-1], u.stack[l-1] - unfinished.lastCompiled(addr, tp) + unfinished.lastCompiled(addr) rv := unfinished.node u.put() return rv @@ -289,12 +291,12 @@ func (u *unfinishedNodes) setRootOutput(out uint64) { u.stack[0].node.finalOutput = out } -func (u *unfinishedNodes) topLastFreeze(addr int, tp *transitionPool) { +func (u *unfinishedNodes) topLastFreeze(addr int) { last := len(u.stack) - 1 - u.stack[last].lastCompiled(addr, tp) + u.stack[last].lastCompiled(addr) } -func (u *unfinishedNodes) addSuffix(bs []byte, out uint64, p *builderNodePool) { +func (u *unfinishedNodes) addSuffix(bs []byte, out uint64) { if len(bs) == 0 { return } @@ -304,13 +306,13 @@ func (u *unfinishedNodes) addSuffix(bs []byte, out uint64, p *builderNodePool) { u.stack[last].lastOut = out for _, b := range bs[1:] { next := u.get() - next.node = p.alloc() + next.node = u.builderNodePool.Get() next.hasLastT = true next.lastIn = b next.lastOut = 0 u.stack = append(u.stack, next) } - u.pushEmpty(true, p) + u.pushEmpty(true) } type builderNodeUnfinished struct { @@ -320,17 +322,17 @@ type builderNodeUnfinished struct { hasLastT bool } -func (b *builderNodeUnfinished) lastCompiled(addr int, tp *transitionPool) { +func (b *builderNodeUnfinished) lastCompiled(addr int) { if b.hasLastT { transIn := b.lastIn transOut := b.lastOut b.hasLastT = false b.lastOut = 0 - trans := tp.alloc() - trans.in = transIn - trans.out = transOut - trans.addr = addr - b.node.trans = append(b.node.trans, trans) + b.node.trans = append(b.node.trans, transition{ + in: transIn, + out: transOut, + addr: addr, + }) } } @@ -338,8 +340,8 @@ func (b *builderNodeUnfinished) addOutputPrefix(prefix uint64) { if b.node.final { b.node.finalOutput = outputCat(prefix, b.node.finalOutput) } - for _, t := range b.node.trans { - t.out = outputCat(prefix, t.out) + for i := range b.node.trans { + b.node.trans[i].out = outputCat(prefix, b.node.trans[i].out) } if b.hasLastT { b.lastOut = outputCat(prefix, b.lastOut) @@ -348,8 +350,22 @@ func (b *builderNodeUnfinished) addOutputPrefix(prefix uint64) { type builderNode struct { finalOutput uint64 - trans []*transition + trans []transition final bool + + // intrusive linked list + next *builderNode +} + +// reset resets the receiver builderNode to a re-usable state. +func (n *builderNode) reset() { + n.final = false + n.finalOutput = 0 + for i := range n.trans { + n.trans[i] = emptyTransition + } + n.trans = n.trans[:0] + n.next = nil } func (n *builderNode) equiv(o *builderNode) bool { @@ -377,6 +393,8 @@ func (n *builderNode) equiv(o *builderNode) bool { return true } +var emptyTransition = transition{} + type transition struct { out uint64 addr int @@ -398,56 +416,37 @@ func outputCat(l, r uint64) uint64 { return l + r } -// the next builderNode to alloc() will be all[nextOuter][nextInner] +// builderNodePool pools builderNodes using a singly linked list. +// +// NB: builderNode lifecylce is described by the following interactions - +// +------------------------+ +----------------------+ +// | Unfinished Nodes | Transfer once | Registry | +// |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) | +// +------------------------+ marked frozen +----------------------+ +// ^ | +// | | +// | Put() +// | Get() on +-------------------+ when +// +-new char--------| builderNode Pool |<-----------evicted +// +-------------------+ type builderNodePool struct { - all [][]builderNode - nextOuter int - nextInner int -} - -func (p *builderNodePool) reset() { - p.nextOuter = 0 - p.nextInner = 0 + head *builderNode } -func (p *builderNodePool) alloc() *builderNode { - if p.nextOuter >= len(p.all) { - p.all = append(p.all, make([]builderNode, 256)) +func (p *builderNodePool) Get() *builderNode { + if p.head == nil { + return &builderNode{} } - rv := &p.all[p.nextOuter][p.nextInner] - p.nextInner += 1 - if p.nextInner >= len(p.all[p.nextOuter]) { - p.nextOuter += 1 - p.nextInner = 0 - } - rv.finalOutput = 0 - rv.trans = rv.trans[:0] - rv.final = false - return rv + head := p.head + p.head = p.head.next + return head } -// the next transition to alloc() will be all[nextOuter][nextInner] -type transitionPool struct { - all [][]transition - nextOuter int - nextInner int -} - -func (p *transitionPool) reset() { - p.nextOuter = 0 - p.nextInner = 0 -} - -func (p *transitionPool) alloc() *transition { - if p.nextOuter >= len(p.all) { - p.all = append(p.all, make([]transition, 256)) - } - rv := &p.all[p.nextOuter][p.nextInner] - p.nextInner += 1 - if p.nextInner >= len(p.all[p.nextOuter]) { - p.nextOuter += 1 - p.nextInner = 0 +func (p *builderNodePool) Put(v *builderNode) { + if v == nil { + return } - *rv = transition{} - return rv + v.reset() + v.next = p.head + p.head = v } diff --git a/vendor/github.com/couchbase/vellum/decoder_v1.go b/vendor/github.com/couchbase/vellum/decoder_v1.go index 5a0ea68871..d56e61db58 100644 --- a/vendor/github.com/couchbase/vellum/decoder_v1.go +++ b/vendor/github.com/couchbase/vellum/decoder_v1.go @@ -29,8 +29,6 @@ func init() { type decoderV1 struct { data []byte - root uint64 - len uint64 } func newDecoderV1(data []byte) *decoderV1 { @@ -219,7 +217,7 @@ func (f *fstStateV1) Final() bool { } func (f *fstStateV1) FinalOutput() uint64 { - if f.numTrans > 0 && f.final && f.outSize > 0 { + if f.final && f.outSize > 0 { return readPackedUint(f.data[f.outFinal : f.outFinal+f.outSize]) } return 0 diff --git a/vendor/github.com/couchbase/vellum/fst.go b/vendor/github.com/couchbase/vellum/fst.go index ecc528395c..64ee21a410 100644 --- a/vendor/github.com/couchbase/vellum/fst.go +++ b/vendor/github.com/couchbase/vellum/fst.go @@ -74,8 +74,8 @@ func (f *FST) get(input []byte, prealloc fstState) (uint64, bool, error) { if err != nil { return 0, false, err } - for i := range input { - _, curr, output := state.TransitionFor(input[i]) + for _, c := range input { + _, curr, output := state.TransitionFor(c) if curr == noneAddr { return 0, false, nil } @@ -243,6 +243,52 @@ func (f *FST) Reader() (*Reader, error) { return &Reader{f: f}, nil } +func (f *FST) GetMinKey() ([]byte, error) { + var rv []byte + + curr := f.decoder.getRoot() + state, err := f.decoder.stateAt(curr, nil) + if err != nil { + return nil, err + } + + for !state.Final() { + nextTrans := state.TransitionAt(0) + _, curr, _ = state.TransitionFor(nextTrans) + state, err = f.decoder.stateAt(curr, state) + if err != nil { + return nil, err + } + + rv = append(rv, nextTrans) + } + + return rv, nil +} + +func (f *FST) GetMaxKey() ([]byte, error) { + var rv []byte + + curr := f.decoder.getRoot() + state, err := f.decoder.stateAt(curr, nil) + if err != nil { + return nil, err + } + + for state.NumTransitions() > 0 { + nextTrans := state.TransitionAt(state.NumTransitions() - 1) + _, curr, _ = state.TransitionFor(nextTrans) + state, err = f.decoder.stateAt(curr, state) + if err != nil { + return nil, err + } + + rv = append(rv, nextTrans) + } + + return rv, nil +} + // A Reader is meant for a single threaded use type Reader struct { f *FST diff --git a/vendor/github.com/couchbase/vellum/fst_iterator.go b/vendor/github.com/couchbase/vellum/fst_iterator.go index 389ac64aab..eb731395b2 100644 --- a/vendor/github.com/couchbase/vellum/fst_iterator.go +++ b/vendor/github.com/couchbase/vellum/fst_iterator.go @@ -76,7 +76,8 @@ func newIterator(f *FST, startKeyInclusive, endKeyExclusive []byte, // Reset resets the Iterator' internal state to allow for iterator // reuse (e.g. pooling). -func (i *FSTIterator) Reset(f *FST, startKeyInclusive, endKeyExclusive []byte, aut Automaton) error { +func (i *FSTIterator) Reset(f *FST, + startKeyInclusive, endKeyExclusive []byte, aut Automaton) error { if aut == nil { aut = alwaysMatchAutomaton } @@ -91,14 +92,14 @@ func (i *FSTIterator) Reset(f *FST, startKeyInclusive, endKeyExclusive []byte, a // pointTo attempts to point us to the specified location func (i *FSTIterator) pointTo(key []byte) error { - // tried to seek before start if bytes.Compare(key, i.startKeyInclusive) < 0 { key = i.startKeyInclusive } - // trid to see past end - if i.endKeyExclusive != nil && bytes.Compare(key, i.endKeyExclusive) > 0 { + // tried to see past end + if i.endKeyExclusive != nil && + bytes.Compare(key, i.endKeyExclusive) > 0 { key = i.endKeyExclusive } @@ -121,21 +122,23 @@ func (i *FSTIterator) pointTo(key []byte) error { i.statesStack = append(i.statesStack, root) i.autStatesStack = append(i.autStatesStack, autStart) for j := 0; j < len(key); j++ { + keyJ := key[j] curr := i.statesStack[len(i.statesStack)-1] autCurr := i.autStatesStack[len(i.autStatesStack)-1] - pos, nextAddr, nextVal := curr.TransitionFor(key[j]) + pos, nextAddr, nextVal := curr.TransitionFor(keyJ) if nextAddr == noneAddr { // needed transition doesn't exist // find last trans before the one we needed - for q := 0; q < curr.NumTransitions(); q++ { - if curr.TransitionAt(q) < key[j] { + for q := curr.NumTransitions() - 1; q >= 0; q-- { + if curr.TransitionAt(q) < keyJ { maxQ = q + break } } break } - autNext := i.aut.Accept(autCurr, key[j]) + autNext := i.aut.Accept(autCurr, keyJ) next, err := i.f.decoder.stateAt(nextAddr, nil) if err != nil { @@ -143,14 +146,16 @@ func (i *FSTIterator) pointTo(key []byte) error { } i.statesStack = append(i.statesStack, next) - i.keysStack = append(i.keysStack, key[j]) + i.keysStack = append(i.keysStack, keyJ) i.keysPosStack = append(i.keysPosStack, pos) i.valsStack = append(i.valsStack, nextVal) i.autStatesStack = append(i.autStatesStack, autNext) continue } - if !i.statesStack[len(i.statesStack)-1].Final() || !i.aut.IsMatch(i.autStatesStack[len(i.autStatesStack)-1]) || bytes.Compare(i.keysStack, key) < 0 { + if !i.statesStack[len(i.statesStack)-1].Final() || + !i.aut.IsMatch(i.autStatesStack[len(i.autStatesStack)-1]) || + bytes.Compare(i.keysStack, key) < 0 { return i.next(maxQ) } @@ -181,15 +186,12 @@ func (i *FSTIterator) Next() error { } func (i *FSTIterator) next(lastOffset int) error { - // remember where we started - if cap(i.nextStart) < len(i.keysStack) { - i.nextStart = make([]byte, len(i.keysStack)) - } else { - i.nextStart = i.nextStart[0:len(i.keysStack)] - } - copy(i.nextStart, i.keysStack) + i.nextStart = append(i.nextStart[:0], i.keysStack...) + nextOffset := lastOffset + 1 + +OUTER: for true { curr := i.statesStack[len(i.statesStack)-1] autCurr := i.autStatesStack[len(i.autStatesStack)-1] @@ -200,58 +202,62 @@ func (i *FSTIterator) next(lastOffset int) error { return nil } - nextOffset := lastOffset + 1 - if nextOffset < curr.NumTransitions() { + numTrans := curr.NumTransitions() + + INNER: + for nextOffset < numTrans { t := curr.TransitionAt(nextOffset) autNext := i.aut.Accept(autCurr, t) - if i.aut.CanMatch(autNext) { - pos, nextAddr, v := curr.TransitionFor(t) - - // the next slot in the statesStack might have an - // fstState instance that we can reuse - var nextPrealloc fstState - if len(i.statesStack) < cap(i.statesStack) { - nextPrealloc = i.statesStack[0:cap(i.statesStack)][len(i.statesStack)] - } + if !i.aut.CanMatch(autNext) { + nextOffset += 1 + continue INNER + } - // push onto stack - next, err := i.f.decoder.stateAt(nextAddr, nextPrealloc) - if err != nil { - return err - } - i.statesStack = append(i.statesStack, next) - i.keysStack = append(i.keysStack, t) - i.keysPosStack = append(i.keysPosStack, pos) - i.valsStack = append(i.valsStack, v) - i.autStatesStack = append(i.autStatesStack, autNext) - lastOffset = -1 - - // check to see if new keystack might have gone too far - if i.endKeyExclusive != nil && bytes.Compare(i.keysStack, i.endKeyExclusive) >= 0 { - return ErrIteratorDone - } - } else { - lastOffset = nextOffset + pos, nextAddr, v := curr.TransitionFor(t) + + // the next slot in the statesStack might have an + // fstState instance that we can reuse + var nextPrealloc fstState + if len(i.statesStack) < cap(i.statesStack) { + nextPrealloc = i.statesStack[0:cap(i.statesStack)][len(i.statesStack)] } - continue + // push onto stack + next, err := i.f.decoder.stateAt(nextAddr, nextPrealloc) + if err != nil { + return err + } + + i.statesStack = append(i.statesStack, next) + i.keysStack = append(i.keysStack, t) + i.keysPosStack = append(i.keysPosStack, pos) + i.valsStack = append(i.valsStack, v) + i.autStatesStack = append(i.autStatesStack, autNext) + + // check to see if new keystack might have gone too far + if i.endKeyExclusive != nil && + bytes.Compare(i.keysStack, i.endKeyExclusive) >= 0 { + return ErrIteratorDone + } + + nextOffset = 0 + continue OUTER } - if len(i.statesStack) > 1 { - // no transitions, and still room to pop - i.statesStack = i.statesStack[:len(i.statesStack)-1] - i.keysStack = i.keysStack[:len(i.keysStack)-1] - lastOffset = i.keysPosStack[len(i.keysPosStack)-1] - - i.keysPosStack = i.keysPosStack[:len(i.keysPosStack)-1] - i.valsStack = i.valsStack[:len(i.valsStack)-1] - i.autStatesStack = i.autStatesStack[:len(i.autStatesStack)-1] - continue - } else { + if len(i.statesStack) <= 1 { // stack len is 1 (root), can't go back further, we're done break } + // no transitions, and still room to pop + i.statesStack = i.statesStack[:len(i.statesStack)-1] + i.keysStack = i.keysStack[:len(i.keysStack)-1] + + nextOffset = i.keysPosStack[len(i.keysPosStack)-1] + 1 + + i.keysPosStack = i.keysPosStack[:len(i.keysPosStack)-1] + i.valsStack = i.valsStack[:len(i.valsStack)-1] + i.autStatesStack = i.autStatesStack[:len(i.autStatesStack)-1] } return ErrIteratorDone @@ -262,15 +268,12 @@ func (i *FSTIterator) next(lastOffset int) error { // seek operation would go past the last key, or outside the configured // startKeyInclusive/endKeyExclusive then ErrIteratorDone is returned. func (i *FSTIterator) Seek(key []byte) error { - err := i.pointTo(key) - if err != nil { - return err - } - return nil + return i.pointTo(key) } // Close will free any resources held by this iterator. func (i *FSTIterator) Close() error { - // at the moment we don't do anything, but wanted this for API completeness + // at the moment we don't do anything, + // but wanted this for API completeness return nil } diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE b/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE new file mode 100644 index 0000000000..6b0b1270ff --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go b/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go new file mode 100644 index 0000000000..4bf64fef2e --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go @@ -0,0 +1,125 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "fmt" + "sort" + "unicode/utf8" +) + +type FullCharacteristicVector []uint32 + +func (fcv FullCharacteristicVector) shiftAndMask(offset, mask uint32) uint32 { + bucketID := offset / 32 + align := offset - bucketID*32 + if align == 0 { + return fcv[bucketID] & mask + } + left := fcv[bucketID] >> align + right := fcv[bucketID+1] << (32 - align) + return (left | right) & mask +} + +type tuple struct { + char rune + fcv FullCharacteristicVector +} + +type sortRunes []rune + +func (s sortRunes) Less(i, j int) bool { + return s[i] < s[j] +} + +func (s sortRunes) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +func (s sortRunes) Len() int { + return len(s) +} + +func sortRune(r []rune) []rune { + sort.Sort(sortRunes(r)) + return r +} + +type Alphabet struct { + charset []tuple + index uint32 +} + +func (a *Alphabet) resetNext() { + a.index = 0 +} + +func (a *Alphabet) next() (rune, FullCharacteristicVector, error) { + if int(a.index) >= len(a.charset) { + return 0, nil, fmt.Errorf("eof") + } + + rv := a.charset[a.index] + a.index++ + return rv.char, rv.fcv, nil +} + +func dedupe(in string) string { + lookUp := make(map[rune]struct{}, len(in)) + var rv string + for len(in) > 0 { + r, size := utf8.DecodeRuneInString(in) + in = in[size:] + if _, ok := lookUp[r]; !ok { + rv += string(r) + lookUp[r] = struct{}{} + } + } + return rv +} + +func queryChars(qChars string) Alphabet { + chars := dedupe(qChars) + inChars := sortRune([]rune(chars)) + charsets := make([]tuple, 0, len(inChars)) + + for _, c := range inChars { + tempChars := qChars + var bits []uint32 + for len(tempChars) > 0 { + var chunk string + if len(tempChars) > 32 { + chunk = tempChars[0:32] + tempChars = tempChars[32:] + } else { + chunk = tempChars + tempChars = tempChars[:0] + } + + chunkBits := uint32(0) + bit := uint32(1) + for _, chr := range chunk { + if chr == c { + chunkBits |= bit + } + bit <<= 1 + } + bits = append(bits, chunkBits) + } + bits = append(bits, 0) + charsets = append(charsets, tuple{char: c, fcv: FullCharacteristicVector(bits)}) + } + return Alphabet{charset: charsets} +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go new file mode 100644 index 0000000000..e82a780a52 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go @@ -0,0 +1,250 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "fmt" + "math" +) + +const SinkState = uint32(0) + +type DFA struct { + transitions [][256]uint32 + distances []Distance + initState int + ed uint8 +} + +/// Returns the initial state +func (d *DFA) initialState() int { + return d.initState +} + +/// Returns the Levenshtein distance associated to the +/// current state. +func (d *DFA) distance(stateId int) Distance { + return d.distances[stateId] +} + +/// Returns the number of states in the `DFA`. +func (d *DFA) numStates() int { + return len(d.transitions) +} + +/// Returns the destination state reached after consuming a given byte. +func (d *DFA) transition(fromState int, b uint8) int { + return int(d.transitions[fromState][b]) +} + +func (d *DFA) eval(bytes []uint8) Distance { + state := d.initialState() + + for _, b := range bytes { + state = d.transition(state, b) + } + + return d.distance(state) +} + +func (d *DFA) Start() int { + return int(d.initialState()) +} + +func (d *DFA) IsMatch(state int) bool { + if _, ok := d.distance(state).(Exact); ok { + return true + } + return false +} + +func (d *DFA) CanMatch(state int) bool { + return state > 0 && state < d.numStates() +} + +func (d *DFA) Accept(state int, b byte) int { + return int(d.transition(state, b)) +} + +// WillAlwaysMatch returns if the specified state will always end in a +// matching state. +func (d *DFA) WillAlwaysMatch(state int) bool { + return false +} + +func fill(dest []uint32, val uint32) { + for i := range dest { + dest[i] = val + } +} + +func fillTransitions(dest *[256]uint32, val uint32) { + for i := range dest { + dest[i] = val + } +} + +type Utf8DFAStateBuilder struct { + dfaBuilder *Utf8DFABuilder + stateID uint32 + defaultSuccessor []uint32 +} + +func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8, + toStateID uint32) { + sb.dfaBuilder.transitions[fromStateID][b] = toStateID +} + +func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) { + fromStateID := sb.stateID + chars := []byte(string(in)) + lastByte := chars[len(chars)-1] + + for i, ch := range chars[:len(chars)-1] { + remNumBytes := len(chars) - i - 1 + defaultSuccessor := sb.defaultSuccessor[remNumBytes] + intermediateStateID := sb.dfaBuilder.transitions[fromStateID][ch] + + if intermediateStateID == defaultSuccessor { + intermediateStateID = sb.dfaBuilder.allocate() + fillTransitions(&sb.dfaBuilder.transitions[intermediateStateID], + sb.defaultSuccessor[remNumBytes-1]) + } + + sb.addTransitionID(fromStateID, ch, intermediateStateID) + fromStateID = intermediateStateID + } + + toStateIDDecoded := sb.dfaBuilder.getOrAllocate(original(toStateID)) + sb.addTransitionID(fromStateID, lastByte, toStateIDDecoded) +} + +type Utf8StateId uint32 + +func original(stateId uint32) Utf8StateId { + return predecessor(stateId, 0) +} + +func predecessor(stateId uint32, numSteps uint8) Utf8StateId { + return Utf8StateId(stateId*4 + uint32(numSteps)) +} + +// Utf8DFABuilder makes it possible to define a DFA +// that takes unicode character, and build a `DFA` +// that operates on utf-8 encoded +type Utf8DFABuilder struct { + index []uint32 + distances []Distance + transitions [][256]uint32 + initialState uint32 + numStates uint32 + maxNumStates uint32 +} + +func withMaxStates(maxStates uint32) *Utf8DFABuilder { + rv := &Utf8DFABuilder{ + index: make([]uint32, maxStates*2+100), + distances: make([]Distance, 0, maxStates), + transitions: make([][256]uint32, 0, maxStates), + maxNumStates: maxStates, + } + + for i := range rv.index { + rv.index[i] = math.MaxUint32 + } + + return rv +} + +func (dfab *Utf8DFABuilder) allocate() uint32 { + newState := dfab.numStates + dfab.numStates++ + + dfab.distances = append(dfab.distances, Atleast{d: 255}) + dfab.transitions = append(dfab.transitions, [256]uint32{}) + + return newState +} + +func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 { + if int(state) >= cap(dfab.index) { + cloneIndex := make([]uint32, int(state)*2) + copy(cloneIndex, dfab.index) + dfab.index = cloneIndex + } + if dfab.index[state] != math.MaxUint32 { + return dfab.index[state] + } + + nstate := dfab.allocate() + dfab.index[state] = nstate + + return nstate +} + +func (dfab *Utf8DFABuilder) setInitialState(iState uint32) { + decodedID := dfab.getOrAllocate(original(iState)) + dfab.initialState = decodedID +} + +func (dfab *Utf8DFABuilder) build(ed uint8) *DFA { + return &DFA{ + transitions: dfab.transitions, + distances: dfab.distances, + initState: int(dfab.initialState), + ed: ed, + } +} + +func (dfab *Utf8DFABuilder) addState(state, default_suc_orig uint32, + distance Distance) (*Utf8DFAStateBuilder, error) { + if state > dfab.maxNumStates { + return nil, fmt.Errorf("State id is larger than maxNumStates") + } + + stateID := dfab.getOrAllocate(original(state)) + dfab.distances[stateID] = distance + + defaultSuccID := dfab.getOrAllocate(original(default_suc_orig)) + // creates a chain of states of predecessors of `default_suc_orig`. + // Accepting k-bytes (whatever the bytes are) from `predecessor_states[k-1]` + // leads to the `default_suc_orig` state. + predecessorStates := []uint32{defaultSuccID, + defaultSuccID, + defaultSuccID, + defaultSuccID} + + for numBytes := uint8(1); numBytes < 4; numBytes++ { + predecessorState := predecessor(default_suc_orig, numBytes) + predecessorStateID := dfab.getOrAllocate(predecessorState) + predecessorStates[numBytes] = predecessorStateID + succ := predecessorStates[numBytes-1] + fillTransitions(&dfab.transitions[predecessorStateID], succ) + } + + // 1-byte encoded chars. + fill(dfab.transitions[stateID][0:192], predecessorStates[0]) + // 2-bytes encoded chars. + fill(dfab.transitions[stateID][192:224], predecessorStates[1]) + // 3-bytes encoded chars. + fill(dfab.transitions[stateID][224:240], predecessorStates[2]) + // 4-bytes encoded chars. + fill(dfab.transitions[stateID][240:256], predecessorStates[3]) + + return &Utf8DFAStateBuilder{ + dfaBuilder: dfab, + stateID: stateID, + defaultSuccessor: predecessorStates}, nil +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go new file mode 100644 index 0000000000..1ca0aaa65b --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go @@ -0,0 +1,64 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import "fmt" + +// StateLimit is the maximum number of states allowed +const StateLimit = 10000 + +// ErrTooManyStates is returned if you attempt to build a Levenshtein +// automaton which requires too many states. +var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states", + StateLimit) + +// LevenshteinAutomatonBuilder wraps a precomputed +// datastructure that allows to produce small (but not minimal) DFA. +type LevenshteinAutomatonBuilder struct { + pDfa *ParametricDFA +} + +// NewLevenshteinAutomatonBuilder creates a +// reusable, threadsafe Levenshtein automaton builder. +// `maxDistance` - maximum distance considered by the automaton. +// `transposition` - assign a distance of 1 for transposition +// +// Building this automaton builder is computationally intensive. +// While it takes only a few milliseconds for `d=2`, it grows +// exponentially with `d`. It is only reasonable to `d <= 5`. +func NewLevenshteinAutomatonBuilder(maxDistance uint8, + transposition bool) (*LevenshteinAutomatonBuilder, error) { + lnfa := newLevenshtein(maxDistance, transposition) + + pdfa, err := fromNfa(lnfa) + if err != nil { + return nil, err + } + + return &LevenshteinAutomatonBuilder{pDfa: pdfa}, nil +} + +// BuildDfa builds the levenshtein automaton for serving +// queries with a given edit distance. +func (lab *LevenshteinAutomatonBuilder) BuildDfa(query string, + fuzziness uint8) (*DFA, error) { + return lab.pDfa.buildDfa(query, fuzziness, false) +} + +// MaxDistance returns the MaxEdit distance supported by the +// LevenshteinAutomatonBuilder builder. +func (lab *LevenshteinAutomatonBuilder) MaxDistance() uint8 { + return lab.pDfa.maxDistance +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go new file mode 100644 index 0000000000..bed9b99d56 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go @@ -0,0 +1,292 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "math" + "sort" +) + +/// Levenshtein Distance computed by a Levenshtein Automaton. +/// +/// Levenshtein automata can only compute the exact Levenshtein distance +/// up to a given `max_distance`. +/// +/// Over this distance, the automaton will invariably +/// return `Distance::AtLeast(max_distance + 1)`. +type Distance interface { + distance() uint8 +} + +type Exact struct { + d uint8 +} + +func (e Exact) distance() uint8 { + return e.d +} + +type Atleast struct { + d uint8 +} + +func (a Atleast) distance() uint8 { + return a.d +} + +func characteristicVector(query []rune, c rune) uint64 { + chi := uint64(0) + for i := 0; i < len(query); i++ { + if query[i] == c { + chi |= 1 << uint64(i) + } + } + return chi +} + +type NFAState struct { + Offset uint32 + Distance uint8 + InTranspose bool +} + +type NFAStates []NFAState + +func (ns NFAStates) Len() int { + return len(ns) +} + +func (ns NFAStates) Less(i, j int) bool { + if ns[i].Offset != ns[j].Offset { + return ns[i].Offset < ns[j].Offset + } + + if ns[i].Distance != ns[j].Distance { + return ns[i].Distance < ns[j].Distance + } + + return !ns[i].InTranspose && ns[j].InTranspose +} + +func (ns NFAStates) Swap(i, j int) { + ns[i], ns[j] = ns[j], ns[i] +} + +func (ns *NFAState) imply(other NFAState) bool { + transposeImply := ns.InTranspose + if !other.InTranspose { + transposeImply = !other.InTranspose + } + + deltaOffset := ns.Offset - other.Offset + if ns.Offset < other.Offset { + deltaOffset = other.Offset - ns.Offset + } + + if transposeImply { + return uint32(other.Distance) >= (uint32(ns.Distance) + deltaOffset) + } + + return uint32(other.Distance) > (uint32(ns.Distance) + deltaOffset) +} + +type MultiState struct { + states []NFAState +} + +func (ms *MultiState) States() []NFAState { + return ms.states +} + +func (ms *MultiState) Clear() { + ms.states = ms.states[:0] +} + +func newMultiState() *MultiState { + return &MultiState{states: make([]NFAState, 0)} +} + +func (ms *MultiState) normalize() uint32 { + minOffset := uint32(math.MaxUint32) + + for _, s := range ms.states { + if s.Offset < minOffset { + minOffset = s.Offset + } + } + if minOffset == uint32(math.MaxUint32) { + minOffset = 0 + } + + for i := 0; i < len(ms.states); i++ { + ms.states[i].Offset -= minOffset + } + + sort.Sort(NFAStates(ms.states)) + + return minOffset +} + +func (ms *MultiState) addStates(nState NFAState) { + + for _, s := range ms.states { + if s.imply(nState) { + return + } + } + + i := 0 + for i < len(ms.states) { + if nState.imply(ms.states[i]) { + ms.states = append(ms.states[:i], ms.states[i+1:]...) + } else { + i++ + } + } + ms.states = append(ms.states, nState) + +} + +func extractBit(bitset uint64, pos uint8) bool { + shift := bitset >> pos + bit := shift & 1 + return bit == uint64(1) +} + +func dist(left, right uint32) uint32 { + if left > right { + return left - right + } + return right - left +} + +type LevenshteinNFA struct { + mDistance uint8 + damerau bool +} + +func newLevenshtein(maxD uint8, transposition bool) *LevenshteinNFA { + return &LevenshteinNFA{mDistance: maxD, + damerau: transposition, + } +} + +func (la *LevenshteinNFA) maxDistance() uint8 { + return la.mDistance +} + +func (la *LevenshteinNFA) msDiameter() uint8 { + return 2*la.mDistance + 1 +} + +func (la *LevenshteinNFA) initialStates() *MultiState { + ms := MultiState{} + nfaState := NFAState{} + ms.addStates(nfaState) + return &ms +} + +func (la *LevenshteinNFA) multistateDistance(ms *MultiState, + queryLen uint32) Distance { + minDistance := Atleast{d: la.mDistance + 1} + for _, s := range ms.states { + t := s.Distance + uint8(dist(queryLen, s.Offset)) + if t <= uint8(la.mDistance) { + if minDistance.distance() > t { + minDistance.d = t + } + } + } + + if minDistance.distance() == la.mDistance+1 { + return Atleast{d: la.mDistance + 1} + } + + return minDistance +} + +func (la *LevenshteinNFA) simpleTransition(state NFAState, + symbol uint64, ms *MultiState) { + + if state.Distance < la.mDistance { + // insertion + ms.addStates(NFAState{Offset: state.Offset, + Distance: state.Distance + 1, + InTranspose: false}) + + // substitution + ms.addStates(NFAState{Offset: state.Offset + 1, + Distance: state.Distance + 1, + InTranspose: false}) + + n := la.mDistance + 1 - state.Distance + for d := uint8(1); d < n; d++ { + if extractBit(symbol, d) { + // for d > 0, as many deletion and character match + ms.addStates(NFAState{Offset: state.Offset + 1 + uint32(d), + Distance: state.Distance + d, + InTranspose: false}) + } + } + + if la.damerau && extractBit(symbol, 1) { + ms.addStates(NFAState{ + Offset: state.Offset, + Distance: state.Distance + 1, + InTranspose: true}) + } + + } + + if extractBit(symbol, 0) { + ms.addStates(NFAState{Offset: state.Offset + 1, + Distance: state.Distance, + InTranspose: false}) + } + + if state.InTranspose && extractBit(symbol, 0) { + ms.addStates(NFAState{Offset: state.Offset + 2, + Distance: state.Distance, + InTranspose: false}) + } + +} + +func (la *LevenshteinNFA) transition(cState *MultiState, + dState *MultiState, scv uint64) { + dState.Clear() + mask := (uint64(1) << la.msDiameter()) - uint64(1) + + for _, state := range cState.states { + cv := (scv >> state.Offset) & mask + la.simpleTransition(state, cv, dState) + } + + sort.Sort(NFAStates(dState.states)) +} + +func (la *LevenshteinNFA) computeDistance(query, other []rune) Distance { + cState := la.initialStates() + nState := newMultiState() + + for _, i := range other { + nState.Clear() + chi := characteristicVector(query, i) + la.transition(cState, nState, chi) + cState, nState = nState, cState + } + + return la.multistateDistance(cState, uint32(len(query))) +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go new file mode 100644 index 0000000000..ebd9311959 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go @@ -0,0 +1,349 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "crypto/md5" + "encoding/json" + "fmt" + "math" +) + +type ParametricState struct { + shapeID uint32 + offset uint32 +} + +func newParametricState() ParametricState { + return ParametricState{} +} + +func (ps *ParametricState) isDeadEnd() bool { + return ps.shapeID == 0 +} + +type Transition struct { + destShapeID uint32 + deltaOffset uint32 +} + +func (t *Transition) apply(state ParametricState) ParametricState { + ps := ParametricState{ + shapeID: t.destShapeID} + // don't need any offset if we are in the dead state, + // this ensures we have only one dead state. + if t.destShapeID != 0 { + ps.offset = state.offset + t.deltaOffset + } + + return ps +} + +type ParametricStateIndex struct { + stateIndex []uint32 + stateQueue []ParametricState + numOffsets uint32 +} + +func newParametricStateIndex(queryLen, + numParamState uint32) ParametricStateIndex { + numOffsets := queryLen + 1 + if numParamState == 0 { + numParamState = numOffsets + } + maxNumStates := numParamState * numOffsets + psi := ParametricStateIndex{ + stateIndex: make([]uint32, maxNumStates), + stateQueue: make([]ParametricState, 0, 150), + numOffsets: numOffsets, + } + + for i := uint32(0); i < maxNumStates; i++ { + psi.stateIndex[i] = math.MaxUint32 + } + return psi +} + +func (psi *ParametricStateIndex) numStates() int { + return len(psi.stateQueue) +} + +func (psi *ParametricStateIndex) maxNumStates() int { + return len(psi.stateIndex) +} + +func (psi *ParametricStateIndex) get(stateID uint32) ParametricState { + return psi.stateQueue[stateID] +} + +func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 { + bucket := ps.shapeID*psi.numOffsets + ps.offset + if bucket < uint32(len(psi.stateIndex)) && + psi.stateIndex[bucket] != math.MaxUint32 { + return psi.stateIndex[bucket] + } + nState := uint32(len(psi.stateQueue)) + psi.stateQueue = append(psi.stateQueue, ps) + + psi.stateIndex[bucket] = nState + return nState +} + +type ParametricDFA struct { + distance []uint8 + transitions []Transition + maxDistance uint8 + transitionStride uint32 + diameter uint32 +} + +func (pdfa *ParametricDFA) initialState() ParametricState { + return ParametricState{shapeID: 1} +} + +// Returns true iff whatever characters come afterward, +// we will never reach a shorter distance +func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool { + if state.isDeadEnd() { + return true + } + + remOffset := queryLen - state.offset + if remOffset < pdfa.diameter { + stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:] + prefixDistance := stateDistances[remOffset] + if prefixDistance > pdfa.maxDistance { + return false + } + + for _, d := range stateDistances { + if d < prefixDistance { + return false + } + } + return true + } + return false +} + +func (pdfa *ParametricDFA) numStates() int { + return len(pdfa.transitions) / int(pdfa.transitionStride) +} + +func min(x, y uint32) uint32 { + if x < y { + return x + } + return y +} + +func (pdfa *ParametricDFA) transition(state ParametricState, + chi uint32) Transition { + return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi] +} + +func (pdfa *ParametricDFA) getDistance(state ParametricState, + qLen uint32) Distance { + remainingOffset := qLen - state.offset + if state.isDeadEnd() || remainingOffset >= pdfa.diameter { + return Atleast{d: pdfa.maxDistance + 1} + } + dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)] + if dist > pdfa.maxDistance { + return Atleast{d: dist} + } + return Exact{d: dist} +} + +func (pdfa *ParametricDFA) computeDistance(left, right string) Distance { + state := pdfa.initialState() + leftChars := []rune(left) + for _, chr := range []rune(right) { + start := state.offset + stop := min(start+pdfa.diameter, uint32(len(leftChars))) + chi := characteristicVector(leftChars[start:stop], chr) + transition := pdfa.transition(state, uint32(chi)) + state = transition.apply(state) + if state.isDeadEnd() { + return Atleast{d: pdfa.maxDistance + 1} + } + } + return pdfa.getDistance(state, uint32(len(left))) +} + +func (pdfa *ParametricDFA) buildDfa(query string, distance uint8, + prefix bool) (*DFA, error) { + qLen := uint32(len([]rune(query))) + alphabet := queryChars(query) + + psi := newParametricStateIndex(qLen, uint32(pdfa.numStates())) + maxNumStates := psi.maxNumStates() + deadEndStateID := psi.getOrAllocate(newParametricState()) + if deadEndStateID != 0 { + return nil, fmt.Errorf("Invalid dead end state") + } + + initialStateID := psi.getOrAllocate(pdfa.initialState()) + dfaBuilder := withMaxStates(uint32(maxNumStates)) + mask := uint32((1 << pdfa.diameter) - 1) + + var stateID int + for stateID = 0; stateID < StateLimit; stateID++ { + if stateID == psi.numStates() { + break + } + state := psi.get(uint32(stateID)) + if prefix && pdfa.isPrefixSink(state, qLen) { + distance := pdfa.getDistance(state, qLen) + dfaBuilder.addState(uint32(stateID), uint32(stateID), distance) + } else { + transition := pdfa.transition(state, 0) + defSuccessor := transition.apply(state) + defSuccessorID := psi.getOrAllocate(defSuccessor) + distance := pdfa.getDistance(state, qLen) + stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance) + + if err != nil { + return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err) + } + + alphabet.resetNext() + chr, cv, err := alphabet.next() + for err == nil { + chi := cv.shiftAndMask(state.offset, mask) + + transition := pdfa.transition(state, chi) + + destState := transition.apply(state) + + destStateID := psi.getOrAllocate(destState) + + stateBuilder.addTransition(chr, destStateID) + + chr, cv, err = alphabet.next() + } + } + } + + if stateID == StateLimit { + return nil, ErrTooManyStates + } + + dfaBuilder.setInitialState(initialStateID) + return dfaBuilder.build(distance), nil +} + +func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) { + lookUp := newHash() + lookUp.getOrAllocate(*newMultiState()) + initialState := nfa.initialStates() + lookUp.getOrAllocate(*initialState) + + maxDistance := nfa.maxDistance() + msDiameter := nfa.msDiameter() + + numChi := 1 << msDiameter + chiValues := make([]uint64, numChi) + for i := 0; i < numChi; i++ { + chiValues[i] = uint64(i) + } + + transitions := make([]Transition, 0, numChi*int(msDiameter)) + var stateID int + for stateID = 0; stateID < StateLimit; stateID++ { + if stateID == len(lookUp.items) { + break + } + + for _, chi := range chiValues { + destMs := newMultiState() + + ms := lookUp.getFromID(stateID) + + nfa.transition(ms, destMs, chi) + + translation := destMs.normalize() + + destID := lookUp.getOrAllocate(*destMs) + + transitions = append(transitions, Transition{ + destShapeID: uint32(destID), + deltaOffset: translation, + }) + } + } + + if stateID == StateLimit { + return nil, ErrTooManyStates + } + + ns := len(lookUp.items) + diameter := int(msDiameter) + + distances := make([]uint8, 0, diameter*ns) + for stateID := 0; stateID < ns; stateID++ { + ms := lookUp.getFromID(stateID) + for offset := 0; offset < diameter; offset++ { + dist := nfa.multistateDistance(ms, uint32(offset)) + distances = append(distances, dist.distance()) + } + } + + return &ParametricDFA{ + diameter: uint32(msDiameter), + transitions: transitions, + maxDistance: maxDistance, + transitionStride: uint32(numChi), + distance: distances, + }, nil +} + +type hash struct { + index map[[16]byte]int + items []MultiState +} + +func newHash() *hash { + return &hash{ + index: make(map[[16]byte]int, 100), + items: make([]MultiState, 0, 100), + } +} + +func (h *hash) getOrAllocate(m MultiState) int { + size := len(h.items) + var exists bool + var pos int + md5 := getHash(&m) + if pos, exists = h.index[md5]; !exists { + h.index[md5] = size + pos = size + h.items = append(h.items, m) + } + return pos +} + +func (h *hash) getFromID(id int) *MultiState { + return &h.items[id] +} + +func getHash(ms *MultiState) [16]byte { + msBytes := []byte{} + for _, state := range ms.states { + jsonBytes, _ := json.Marshal(&state) + msBytes = append(msBytes, jsonBytes...) + } + return md5.Sum(msBytes) +} diff --git a/vendor/github.com/couchbase/vellum/regexp/compile.go b/vendor/github.com/couchbase/vellum/regexp/compile.go index 6922b749db..55280164c7 100644 --- a/vendor/github.com/couchbase/vellum/regexp/compile.go +++ b/vendor/github.com/couchbase/vellum/regexp/compile.go @@ -18,17 +18,27 @@ import ( "regexp/syntax" "unicode" + unicode_utf8 "unicode/utf8" + "github.com/couchbase/vellum/utf8" ) type compiler struct { sizeLimit uint insts prog + instsPool []inst + + sequences utf8.Sequences + rangeStack utf8.RangeStack + startBytes []byte + endBytes []byte } func newCompiler(sizeLimit uint) *compiler { return &compiler{ - sizeLimit: sizeLimit, + sizeLimit: sizeLimit, + startBytes: make([]byte, unicode_utf8.UTFMax), + endBytes: make([]byte, unicode_utf8.UTFMax), } } @@ -37,13 +47,13 @@ func (c *compiler) compile(ast *syntax.Regexp) (prog, error) { if err != nil { return nil, err } - c.insts = append(c.insts, &inst{ - op: OpMatch, - }) + inst := c.allocInst() + inst.op = OpMatch + c.insts = append(c.insts, inst) return c.insts, nil } -func (c *compiler) c(ast *syntax.Regexp) error { +func (c *compiler) c(ast *syntax.Regexp) (err error) { if ast.Flags&syntax.NonGreedy > 1 { return ErrNoLazy } @@ -67,11 +77,12 @@ func (c *compiler) c(ast *syntax.Regexp) error { next.Rune = next.Rune0[0:2] return c.c(&next) } - seqs, err := utf8.NewSequences(r, r) + c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc( + r, r, c.sequences, c.rangeStack, c.startBytes, c.endBytes) if err != nil { return err } - for _, seq := range seqs { + for _, seq := range c.sequences { c.compileUtf8Ranges(seq) } } @@ -106,8 +117,7 @@ func (c *compiler) c(ast *syntax.Regexp) error { if len(ast.Sub) == 0 { return nil } - jmpsToEnd := []uint{} - + jmpsToEnd := make([]uint, 0, len(ast.Sub)-1) // does not handle last entry for i := 0; i < len(ast.Sub)-1; i++ { sub := ast.Sub[i] @@ -188,7 +198,8 @@ func (c *compiler) c(ast *syntax.Regexp) error { return err } } - var splits, starts []uint + splits := make([]uint, 0, ast.Max-ast.Min) + starts := make([]uint, 0, ast.Max-ast.Min) for i := ast.Min; i < ast.Max; i++ { splits = append(splits, c.emptySplit()) starts = append(starts, uint(len(c.insts))) @@ -218,8 +229,7 @@ func (c *compiler) compileClass(ast *syntax.Regexp) error { if len(ast.Rune) == 0 { return nil } - var jmps []uint - + jmps := make([]uint, 0, len(ast.Rune)-2) // does not do last pair for i := 0; i < len(ast.Rune)-2; i += 2 { rstart := ast.Rune[i] @@ -249,16 +259,16 @@ func (c *compiler) compileClass(ast *syntax.Regexp) error { return nil } -func (c *compiler) compileClassRange(startR, endR rune) error { - seqs, err := utf8.NewSequences(startR, endR) +func (c *compiler) compileClassRange(startR, endR rune) (err error) { + c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc( + startR, endR, c.sequences, c.rangeStack, c.startBytes, c.endBytes) if err != nil { return err } - var jmps []uint - + jmps := make([]uint, 0, len(c.sequences)-1) // does not do last entry - for i := 0; i < len(seqs)-1; i++ { - seq := seqs[i] + for i := 0; i < len(c.sequences)-1; i++ { + seq := c.sequences[i] split := c.emptySplit() j1 := c.top() c.compileUtf8Ranges(seq) @@ -267,7 +277,7 @@ func (c *compiler) compileClassRange(startR, endR rune) error { c.setSplit(split, j1, j2) } // handle last entry - c.compileUtf8Ranges(seqs[len(seqs)-1]) + c.compileUtf8Ranges(c.sequences[len(c.sequences)-1]) end := c.top() for _, jmp := range jmps { c.setJump(jmp, end) @@ -278,25 +288,25 @@ func (c *compiler) compileClassRange(startR, endR rune) error { func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) { for _, r := range seq { - c.insts = append(c.insts, &inst{ - op: OpRange, - rangeStart: r.Start, - rangeEnd: r.End, - }) + inst := c.allocInst() + inst.op = OpRange + inst.rangeStart = r.Start + inst.rangeEnd = r.End + c.insts = append(c.insts, inst) } } func (c *compiler) emptySplit() uint { - c.insts = append(c.insts, &inst{ - op: OpSplit, - }) + inst := c.allocInst() + inst.op = OpSplit + c.insts = append(c.insts, inst) return c.top() - 1 } func (c *compiler) emptyJump() uint { - c.insts = append(c.insts, &inst{ - op: OpJmp, - }) + inst := c.allocInst() + inst.op = OpJmp + c.insts = append(c.insts, inst) return c.top() - 1 } @@ -314,3 +324,12 @@ func (c *compiler) setJump(i, pc uint) { func (c *compiler) top() uint { return uint(len(c.insts)) } + +func (c *compiler) allocInst() *inst { + if len(c.instsPool) <= 0 { + c.instsPool = make([]inst, 16) + } + inst := &c.instsPool[0] + c.instsPool = c.instsPool[1:] + return inst +} diff --git a/vendor/github.com/couchbase/vellum/regexp/dfa.go b/vendor/github.com/couchbase/vellum/regexp/dfa.go index 9864606b6a..7e6fb29dac 100644 --- a/vendor/github.com/couchbase/vellum/regexp/dfa.go +++ b/vendor/github.com/couchbase/vellum/regexp/dfa.go @@ -23,7 +23,7 @@ import ( const StateLimit = 10000 // ErrTooManyStates is returned if you attempt to build a Levenshtein -// automaton which requries too many states. +// automaton which requires too many states. var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states", StateLimit) @@ -37,12 +37,12 @@ func newDfaBuilder(insts prog) *dfaBuilder { d := &dfaBuilder{ dfa: &dfa{ insts: insts, - states: make([]*state, 0, 16), + states: make([]state, 0, 16), }, cache: make(map[string]int, 1024), } // add 0 state that is invalid - d.dfa.states = append(d.dfa.states, &state{ + d.dfa.states = append(d.dfa.states, state{ next: make([]int, 256), match: false, }) @@ -54,13 +54,15 @@ func (d *dfaBuilder) build() (*dfa, error) { next := newSparseSet(uint(len(d.dfa.insts))) d.dfa.add(cur, 0) - states := intStack{d.cachedState(cur)} + ns, instsReuse := d.cachedState(cur, nil) + states := intStack{ns} seen := make(map[int]struct{}) var s int states, s = states.Pop() for s != 0 { for b := 0; b < 256; b++ { - ns := d.runState(cur, next, s, byte(b)) + var ns int + ns, instsReuse = d.runState(cur, next, s, byte(b), instsReuse) if ns != 0 { if _, ok := seen[ns]; !ok { seen[ns] = struct{}{} @@ -76,15 +78,17 @@ func (d *dfaBuilder) build() (*dfa, error) { return d.dfa, nil } -func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte) int { +func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte, instsReuse []uint) ( + int, []uint) { cur.Clear() for _, ip := range d.dfa.states[state].insts { cur.Add(ip) } d.dfa.run(cur, next, b) - nextState := d.cachedState(next) + var nextState int + nextState, instsReuse = d.cachedState(next, instsReuse) d.dfa.states[state].next[b] = nextState - return nextState + return nextState, instsReuse } func instsKey(insts []uint, buf []byte) []byte { @@ -99,8 +103,12 @@ func instsKey(insts []uint, buf []byte) []byte { return buf } -func (d *dfaBuilder) cachedState(set *sparseSet) int { - var insts []uint +func (d *dfaBuilder) cachedState(set *sparseSet, + instsReuse []uint) (int, []uint) { + insts := instsReuse[:0] + if cap(insts) == 0 { + insts = make([]uint, 0, set.Len()) + } var isMatch bool for i := uint(0); i < uint(set.Len()); i++ { ip := set.Get(i) @@ -113,26 +121,26 @@ func (d *dfaBuilder) cachedState(set *sparseSet) int { } } if len(insts) == 0 { - return 0 + return 0, insts } d.keyBuf = instsKey(insts, d.keyBuf) v, ok := d.cache[string(d.keyBuf)] if ok { - return v + return v, insts } - d.dfa.states = append(d.dfa.states, &state{ + d.dfa.states = append(d.dfa.states, state{ insts: insts, next: make([]int, 256), match: isMatch, }) newV := len(d.dfa.states) - 1 d.cache[string(d.keyBuf)] = newV - return newV + return newV, nil } type dfa struct { insts prog - states []*state + states []state } func (d *dfa) add(set *sparseSet, ip uint) { diff --git a/vendor/github.com/couchbase/vellum/regexp/inst.go b/vendor/github.com/couchbase/vellum/regexp/inst.go index 61cbf2f333..36f2e602df 100644 --- a/vendor/github.com/couchbase/vellum/regexp/inst.go +++ b/vendor/github.com/couchbase/vellum/regexp/inst.go @@ -27,7 +27,7 @@ const ( OpRange ) -// instSize is the approxmiate size of the an inst struct in bytes +// instSize is the approximate size of the an inst struct in bytes const instSize = 40 type inst struct { diff --git a/vendor/github.com/couchbase/vellum/regexp/regexp.go b/vendor/github.com/couchbase/vellum/regexp/regexp.go index ed0e7823e1..920ddc3708 100644 --- a/vendor/github.com/couchbase/vellum/regexp/regexp.go +++ b/vendor/github.com/couchbase/vellum/regexp/regexp.go @@ -35,6 +35,8 @@ var ErrNoLazy = fmt.Errorf("lazy quantifiers are not allowed") // too many instructions var ErrCompiledTooBig = fmt.Errorf("too many instructions") +var DefaultLimit = uint(10 * (1 << 20)) + // Regexp implements the vellum.Automaton interface for matcing a user // specified regular expression. type Regexp struct { @@ -47,7 +49,7 @@ type Regexp struct { // compiled finite state automaton. If this size is exceeded, // ErrCompiledTooBig will be returned. func New(expr string) (*Regexp, error) { - return NewWithLimit(expr, 10*(1<<20)) + return NewWithLimit(expr, DefaultLimit) } // NewRegexpWithLimit creates a new Regular Expression automaton with @@ -59,6 +61,10 @@ func NewWithLimit(expr string, size uint) (*Regexp, error) { if err != nil { return nil, err } + return NewParsedWithLimit(expr, parsed, size) +} + +func NewParsedWithLimit(expr string, parsed *syntax.Regexp, size uint) (*Regexp, error) { compiler := newCompiler(size) insts, err := compiler.compile(parsed) if err != nil { @@ -103,7 +109,7 @@ func (r *Regexp) WillAlwaysMatch(int) bool { return false } -// Accept returns the new state, resulting from the transite byte b +// Accept returns the new state, resulting from the transition byte b // when currently in the state s. func (r *Regexp) Accept(s int, b byte) int { if s < len(r.dfa.states) { diff --git a/vendor/github.com/couchbase/vellum/registry.go b/vendor/github.com/couchbase/vellum/registry.go index 3721a7c9c3..f5b9b4d59c 100644 --- a/vendor/github.com/couchbase/vellum/registry.go +++ b/vendor/github.com/couchbase/vellum/registry.go @@ -14,39 +14,35 @@ package vellum -import ( - "hash" - "hash/fnv" -) - type registryCell struct { addr int node *builderNode } type registry struct { - table []registryCell - tableSize uint - mruSize uint - hasher hash.Hash64 + builderNodePool *builderNodePool + table []registryCell + tableSize uint + mruSize uint } -func newRegistry(tableSize, mruSize int) *registry { +func newRegistry(p *builderNodePool, tableSize, mruSize int) *registry { nsize := tableSize * mruSize rv := ®istry{ - table: make([]registryCell, nsize), - tableSize: uint(tableSize), - mruSize: uint(mruSize), - hasher: fnv.New64a(), + builderNodePool: p, + table: make([]registryCell, nsize), + tableSize: uint(tableSize), + mruSize: uint(mruSize), } return rv } func (r *registry) Reset() { - for i := 0; i < len(r.table); i++ { - r.table[i] = registryCell{} + var empty registryCell + for i := range r.table { + r.builderNodePool.Put(r.table[i].node) + r.table[i] = empty } - r.hasher.Reset() } func (r *registry) entry(node *builderNode) (bool, int, *registryCell) { @@ -57,7 +53,7 @@ func (r *registry) entry(node *builderNode) (bool, int, *registryCell) { start := r.mruSize * uint(bucket) end := start + r.mruSize rc := registryCache(r.table[start:end]) - return rc.entry(node) + return rc.entry(node, r.builderNodePool) } const fnvPrime = 1099511628211 @@ -81,11 +77,12 @@ func (r *registry) hash(b *builderNode) int { type registryCache []registryCell -func (r registryCache) entry(node *builderNode) (bool, int, *registryCell) { +func (r registryCache) entry(node *builderNode, pool *builderNodePool) (bool, int, *registryCell) { if len(r) == 1 { if r[0].node != nil && r[0].node.equiv(node) { return true, r[0].addr, nil } + pool.Put(r[0].node) r[0].node = node return false, 0, &r[0] } @@ -98,6 +95,7 @@ func (r registryCache) entry(node *builderNode) (bool, int, *registryCell) { } // no match last := len(r) - 1 + pool.Put(r[last].node) r[last].node = node // discard LRU r.promote(last) return false, 0, &r[0] diff --git a/vendor/github.com/couchbase/vellum/utf8/utf8.go b/vendor/github.com/couchbase/vellum/utf8/utf8.go index 47dbe9d1c5..54e23b937c 100644 --- a/vendor/github.com/couchbase/vellum/utf8/utf8.go +++ b/vendor/github.com/couchbase/vellum/utf8/utf8.go @@ -25,19 +25,39 @@ type Sequences []Sequence // NewSequences constructs a collection of Sequence which describe the // byte ranges covered between the start and end runes. func NewSequences(start, end rune) (Sequences, error) { - var rv Sequences + rv, _, err := NewSequencesPrealloc(start, end, nil, nil, nil, nil) + return rv, err +} + +func NewSequencesPrealloc(start, end rune, + preallocSequences Sequences, + preallocRangeStack RangeStack, + preallocStartBytes, preallocEndBytes []byte) (Sequences, RangeStack, error) { + rv := preallocSequences[:0] + + startBytes := preallocStartBytes + if cap(startBytes) < utf8.UTFMax { + startBytes = make([]byte, utf8.UTFMax) + } + startBytes = startBytes[:utf8.UTFMax] - var rangeStack rangeStack - rangeStack = rangeStack.Push(&scalarRange{start, end}) + endBytes := preallocEndBytes + if cap(endBytes) < utf8.UTFMax { + endBytes = make([]byte, utf8.UTFMax) + } + endBytes = endBytes[:utf8.UTFMax] + + rangeStack := preallocRangeStack[:0] + rangeStack = rangeStack.Push(scalarRange{start, end}) rangeStack, r := rangeStack.Pop() TOP: - for r != nil { + for r != nilScalarRange { INNER: for { r1, r2 := r.split() - if r1 != nil { - rangeStack = rangeStack.Push(&scalarRange{r2.start, r2.end}) + if r1 != nilScalarRange { + rangeStack = rangeStack.Push(scalarRange{r2.start, r2.end}) r.start = r1.start r.end = r1.end continue INNER @@ -49,13 +69,13 @@ TOP: for i := 1; i < utf8.UTFMax; i++ { max := maxScalarValue(i) if r.start <= max && max < r.end { - rangeStack = rangeStack.Push(&scalarRange{max + 1, r.end}) + rangeStack = rangeStack.Push(scalarRange{max + 1, r.end}) r.end = max continue INNER } } asciiRange := r.ascii() - if asciiRange != nil { + if asciiRange != nilRange { rv = append(rv, Sequence{ asciiRange, }) @@ -66,23 +86,21 @@ TOP: m := rune((1 << (6 * i)) - 1) if (r.start & ^m) != (r.end & ^m) { if (r.start & m) != 0 { - rangeStack = rangeStack.Push(&scalarRange{(r.start | m) + 1, r.end}) + rangeStack = rangeStack.Push(scalarRange{(r.start | m) + 1, r.end}) r.end = r.start | m continue INNER } if (r.end & m) != m { - rangeStack = rangeStack.Push(&scalarRange{r.end & ^m, r.end}) + rangeStack = rangeStack.Push(scalarRange{r.end & ^m, r.end}) r.end = (r.end & ^m) - 1 continue INNER } } } - start := make([]byte, utf8.UTFMax) - end := make([]byte, utf8.UTFMax) - n, m := r.encode(start, end) - seq, err := SequenceFromEncodedRange(start[0:n], end[0:m]) + n, m := r.encode(startBytes, endBytes) + seq, err := SequenceFromEncodedRange(startBytes[0:n], endBytes[0:m]) if err != nil { - return nil, err + return nil, nil, err } rv = append(rv, seq) rangeStack, r = rangeStack.Pop() @@ -90,11 +108,11 @@ TOP: } } - return rv, nil + return rv, rangeStack, nil } -// Sequence is a collection of *Range -type Sequence []*Range +// Sequence is a collection of Range +type Sequence []Range // SequenceFromEncodedRange creates sequence from the encoded bytes func SequenceFromEncodedRange(start, end []byte) (Sequence, error) { @@ -104,21 +122,21 @@ func SequenceFromEncodedRange(start, end []byte) (Sequence, error) { switch len(start) { case 2: return Sequence{ - &Range{start[0], end[0]}, - &Range{start[1], end[1]}, + Range{start[0], end[0]}, + Range{start[1], end[1]}, }, nil case 3: return Sequence{ - &Range{start[0], end[0]}, - &Range{start[1], end[1]}, - &Range{start[2], end[2]}, + Range{start[0], end[0]}, + Range{start[1], end[1]}, + Range{start[2], end[2]}, }, nil case 4: return Sequence{ - &Range{start[0], end[0]}, - &Range{start[1], end[1]}, - &Range{start[2], end[2]}, - &Range{start[3], end[3]}, + Range{start[0], end[0]}, + Range{start[1], end[1]}, + Range{start[2], end[2]}, + Range{start[3], end[3]}, }, nil } @@ -159,6 +177,8 @@ type Range struct { End byte } +var nilRange = Range{0xff, 0} + func (u Range) matches(b byte) bool { if u.Start <= b && b <= u.End { return true @@ -178,37 +198,39 @@ type scalarRange struct { end rune } +var nilScalarRange = scalarRange{0xffff, 0} + func (s *scalarRange) String() string { return fmt.Sprintf("ScalarRange(%d,%d)", s.start, s.end) } // split this scalar range if it overlaps with a surrogate codepoint -func (s *scalarRange) split() (*scalarRange, *scalarRange) { +func (s *scalarRange) split() (scalarRange, scalarRange) { if s.start < 0xe000 && s.end > 0xd7ff { - return &scalarRange{ + return scalarRange{ start: s.start, end: 0xd7ff, }, - &scalarRange{ + scalarRange{ start: 0xe000, end: s.end, } } - return nil, nil + return nilScalarRange, nilScalarRange } func (s *scalarRange) valid() bool { return s.start <= s.end } -func (s *scalarRange) ascii() *Range { +func (s *scalarRange) ascii() Range { if s.valid() && s.end <= 0x7f { - return &Range{ + return Range{ Start: byte(s.start), End: byte(s.end), } } - return nil + return nilRange } // start and end MUST have capacity for utf8.UTFMax bytes @@ -218,16 +240,16 @@ func (s *scalarRange) encode(start, end []byte) (int, int) { return n, m } -type rangeStack []*scalarRange +type RangeStack []scalarRange -func (s rangeStack) Push(v *scalarRange) rangeStack { +func (s RangeStack) Push(v scalarRange) RangeStack { return append(s, v) } -func (s rangeStack) Pop() (rangeStack, *scalarRange) { +func (s RangeStack) Pop() (RangeStack, scalarRange) { l := len(s) if l < 1 { - return s, nil + return s, nilScalarRange } return s[:l-1], s[l-1] } |