aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go')
-rw-r--r--vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go349
1 files changed, 349 insertions, 0 deletions
diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go
new file mode 100644
index 0000000000..ebd9311959
--- /dev/null
+++ b/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go
@@ -0,0 +1,349 @@
+// Copyright (c) 2018 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package levenshtein2
+
+import (
+ "crypto/md5"
+ "encoding/json"
+ "fmt"
+ "math"
+)
+
+type ParametricState struct {
+ shapeID uint32
+ offset uint32
+}
+
+func newParametricState() ParametricState {
+ return ParametricState{}
+}
+
+func (ps *ParametricState) isDeadEnd() bool {
+ return ps.shapeID == 0
+}
+
+type Transition struct {
+ destShapeID uint32
+ deltaOffset uint32
+}
+
+func (t *Transition) apply(state ParametricState) ParametricState {
+ ps := ParametricState{
+ shapeID: t.destShapeID}
+ // don't need any offset if we are in the dead state,
+ // this ensures we have only one dead state.
+ if t.destShapeID != 0 {
+ ps.offset = state.offset + t.deltaOffset
+ }
+
+ return ps
+}
+
+type ParametricStateIndex struct {
+ stateIndex []uint32
+ stateQueue []ParametricState
+ numOffsets uint32
+}
+
+func newParametricStateIndex(queryLen,
+ numParamState uint32) ParametricStateIndex {
+ numOffsets := queryLen + 1
+ if numParamState == 0 {
+ numParamState = numOffsets
+ }
+ maxNumStates := numParamState * numOffsets
+ psi := ParametricStateIndex{
+ stateIndex: make([]uint32, maxNumStates),
+ stateQueue: make([]ParametricState, 0, 150),
+ numOffsets: numOffsets,
+ }
+
+ for i := uint32(0); i < maxNumStates; i++ {
+ psi.stateIndex[i] = math.MaxUint32
+ }
+ return psi
+}
+
+func (psi *ParametricStateIndex) numStates() int {
+ return len(psi.stateQueue)
+}
+
+func (psi *ParametricStateIndex) maxNumStates() int {
+ return len(psi.stateIndex)
+}
+
+func (psi *ParametricStateIndex) get(stateID uint32) ParametricState {
+ return psi.stateQueue[stateID]
+}
+
+func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 {
+ bucket := ps.shapeID*psi.numOffsets + ps.offset
+ if bucket < uint32(len(psi.stateIndex)) &&
+ psi.stateIndex[bucket] != math.MaxUint32 {
+ return psi.stateIndex[bucket]
+ }
+ nState := uint32(len(psi.stateQueue))
+ psi.stateQueue = append(psi.stateQueue, ps)
+
+ psi.stateIndex[bucket] = nState
+ return nState
+}
+
+type ParametricDFA struct {
+ distance []uint8
+ transitions []Transition
+ maxDistance uint8
+ transitionStride uint32
+ diameter uint32
+}
+
+func (pdfa *ParametricDFA) initialState() ParametricState {
+ return ParametricState{shapeID: 1}
+}
+
+// Returns true iff whatever characters come afterward,
+// we will never reach a shorter distance
+func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool {
+ if state.isDeadEnd() {
+ return true
+ }
+
+ remOffset := queryLen - state.offset
+ if remOffset < pdfa.diameter {
+ stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:]
+ prefixDistance := stateDistances[remOffset]
+ if prefixDistance > pdfa.maxDistance {
+ return false
+ }
+
+ for _, d := range stateDistances {
+ if d < prefixDistance {
+ return false
+ }
+ }
+ return true
+ }
+ return false
+}
+
+func (pdfa *ParametricDFA) numStates() int {
+ return len(pdfa.transitions) / int(pdfa.transitionStride)
+}
+
+func min(x, y uint32) uint32 {
+ if x < y {
+ return x
+ }
+ return y
+}
+
+func (pdfa *ParametricDFA) transition(state ParametricState,
+ chi uint32) Transition {
+ return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi]
+}
+
+func (pdfa *ParametricDFA) getDistance(state ParametricState,
+ qLen uint32) Distance {
+ remainingOffset := qLen - state.offset
+ if state.isDeadEnd() || remainingOffset >= pdfa.diameter {
+ return Atleast{d: pdfa.maxDistance + 1}
+ }
+ dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)]
+ if dist > pdfa.maxDistance {
+ return Atleast{d: dist}
+ }
+ return Exact{d: dist}
+}
+
+func (pdfa *ParametricDFA) computeDistance(left, right string) Distance {
+ state := pdfa.initialState()
+ leftChars := []rune(left)
+ for _, chr := range []rune(right) {
+ start := state.offset
+ stop := min(start+pdfa.diameter, uint32(len(leftChars)))
+ chi := characteristicVector(leftChars[start:stop], chr)
+ transition := pdfa.transition(state, uint32(chi))
+ state = transition.apply(state)
+ if state.isDeadEnd() {
+ return Atleast{d: pdfa.maxDistance + 1}
+ }
+ }
+ return pdfa.getDistance(state, uint32(len(left)))
+}
+
+func (pdfa *ParametricDFA) buildDfa(query string, distance uint8,
+ prefix bool) (*DFA, error) {
+ qLen := uint32(len([]rune(query)))
+ alphabet := queryChars(query)
+
+ psi := newParametricStateIndex(qLen, uint32(pdfa.numStates()))
+ maxNumStates := psi.maxNumStates()
+ deadEndStateID := psi.getOrAllocate(newParametricState())
+ if deadEndStateID != 0 {
+ return nil, fmt.Errorf("Invalid dead end state")
+ }
+
+ initialStateID := psi.getOrAllocate(pdfa.initialState())
+ dfaBuilder := withMaxStates(uint32(maxNumStates))
+ mask := uint32((1 << pdfa.diameter) - 1)
+
+ var stateID int
+ for stateID = 0; stateID < StateLimit; stateID++ {
+ if stateID == psi.numStates() {
+ break
+ }
+ state := psi.get(uint32(stateID))
+ if prefix && pdfa.isPrefixSink(state, qLen) {
+ distance := pdfa.getDistance(state, qLen)
+ dfaBuilder.addState(uint32(stateID), uint32(stateID), distance)
+ } else {
+ transition := pdfa.transition(state, 0)
+ defSuccessor := transition.apply(state)
+ defSuccessorID := psi.getOrAllocate(defSuccessor)
+ distance := pdfa.getDistance(state, qLen)
+ stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance)
+
+ if err != nil {
+ return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err)
+ }
+
+ alphabet.resetNext()
+ chr, cv, err := alphabet.next()
+ for err == nil {
+ chi := cv.shiftAndMask(state.offset, mask)
+
+ transition := pdfa.transition(state, chi)
+
+ destState := transition.apply(state)
+
+ destStateID := psi.getOrAllocate(destState)
+
+ stateBuilder.addTransition(chr, destStateID)
+
+ chr, cv, err = alphabet.next()
+ }
+ }
+ }
+
+ if stateID == StateLimit {
+ return nil, ErrTooManyStates
+ }
+
+ dfaBuilder.setInitialState(initialStateID)
+ return dfaBuilder.build(distance), nil
+}
+
+func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) {
+ lookUp := newHash()
+ lookUp.getOrAllocate(*newMultiState())
+ initialState := nfa.initialStates()
+ lookUp.getOrAllocate(*initialState)
+
+ maxDistance := nfa.maxDistance()
+ msDiameter := nfa.msDiameter()
+
+ numChi := 1 << msDiameter
+ chiValues := make([]uint64, numChi)
+ for i := 0; i < numChi; i++ {
+ chiValues[i] = uint64(i)
+ }
+
+ transitions := make([]Transition, 0, numChi*int(msDiameter))
+ var stateID int
+ for stateID = 0; stateID < StateLimit; stateID++ {
+ if stateID == len(lookUp.items) {
+ break
+ }
+
+ for _, chi := range chiValues {
+ destMs := newMultiState()
+
+ ms := lookUp.getFromID(stateID)
+
+ nfa.transition(ms, destMs, chi)
+
+ translation := destMs.normalize()
+
+ destID := lookUp.getOrAllocate(*destMs)
+
+ transitions = append(transitions, Transition{
+ destShapeID: uint32(destID),
+ deltaOffset: translation,
+ })
+ }
+ }
+
+ if stateID == StateLimit {
+ return nil, ErrTooManyStates
+ }
+
+ ns := len(lookUp.items)
+ diameter := int(msDiameter)
+
+ distances := make([]uint8, 0, diameter*ns)
+ for stateID := 0; stateID < ns; stateID++ {
+ ms := lookUp.getFromID(stateID)
+ for offset := 0; offset < diameter; offset++ {
+ dist := nfa.multistateDistance(ms, uint32(offset))
+ distances = append(distances, dist.distance())
+ }
+ }
+
+ return &ParametricDFA{
+ diameter: uint32(msDiameter),
+ transitions: transitions,
+ maxDistance: maxDistance,
+ transitionStride: uint32(numChi),
+ distance: distances,
+ }, nil
+}
+
+type hash struct {
+ index map[[16]byte]int
+ items []MultiState
+}
+
+func newHash() *hash {
+ return &hash{
+ index: make(map[[16]byte]int, 100),
+ items: make([]MultiState, 0, 100),
+ }
+}
+
+func (h *hash) getOrAllocate(m MultiState) int {
+ size := len(h.items)
+ var exists bool
+ var pos int
+ md5 := getHash(&m)
+ if pos, exists = h.index[md5]; !exists {
+ h.index[md5] = size
+ pos = size
+ h.items = append(h.items, m)
+ }
+ return pos
+}
+
+func (h *hash) getFromID(id int) *MultiState {
+ return &h.items[id]
+}
+
+func getHash(ms *MultiState) [16]byte {
+ msBytes := []byte{}
+ for _, state := range ms.states {
+ jsonBytes, _ := json.Marshal(&state)
+ msBytes = append(msBytes, jsonBytes...)
+ }
+ return md5.Sum(msBytes)
+}