diff options
Diffstat (limited to 'vendor/github.com/couchbase/vellum/levenshtein2')
6 files changed, 1283 insertions, 0 deletions
diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE b/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE new file mode 100644 index 0000000000..6b0b1270ff --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go b/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go new file mode 100644 index 0000000000..4bf64fef2e --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go @@ -0,0 +1,125 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "fmt" + "sort" + "unicode/utf8" +) + +type FullCharacteristicVector []uint32 + +func (fcv FullCharacteristicVector) shiftAndMask(offset, mask uint32) uint32 { + bucketID := offset / 32 + align := offset - bucketID*32 + if align == 0 { + return fcv[bucketID] & mask + } + left := fcv[bucketID] >> align + right := fcv[bucketID+1] << (32 - align) + return (left | right) & mask +} + +type tuple struct { + char rune + fcv FullCharacteristicVector +} + +type sortRunes []rune + +func (s sortRunes) Less(i, j int) bool { + return s[i] < s[j] +} + +func (s sortRunes) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +func (s sortRunes) Len() int { + return len(s) +} + +func sortRune(r []rune) []rune { + sort.Sort(sortRunes(r)) + return r +} + +type Alphabet struct { + charset []tuple + index uint32 +} + +func (a *Alphabet) resetNext() { + a.index = 0 +} + +func (a *Alphabet) next() (rune, FullCharacteristicVector, error) { + if int(a.index) >= len(a.charset) { + return 0, nil, fmt.Errorf("eof") + } + + rv := a.charset[a.index] + a.index++ + return rv.char, rv.fcv, nil +} + +func dedupe(in string) string { + lookUp := make(map[rune]struct{}, len(in)) + var rv string + for len(in) > 0 { + r, size := utf8.DecodeRuneInString(in) + in = in[size:] + if _, ok := lookUp[r]; !ok { + rv += string(r) + lookUp[r] = struct{}{} + } + } + return rv +} + +func queryChars(qChars string) Alphabet { + chars := dedupe(qChars) + inChars := sortRune([]rune(chars)) + charsets := make([]tuple, 0, len(inChars)) + + for _, c := range inChars { + tempChars := qChars + var bits []uint32 + for len(tempChars) > 0 { + var chunk string + if len(tempChars) > 32 { + chunk = tempChars[0:32] + tempChars = tempChars[32:] + } else { + chunk = tempChars + tempChars = tempChars[:0] + } + + chunkBits := uint32(0) + bit := uint32(1) + for _, chr := range chunk { + if chr == c { + chunkBits |= bit + } + bit <<= 1 + } + bits = append(bits, chunkBits) + } + bits = append(bits, 0) + charsets = append(charsets, tuple{char: c, fcv: FullCharacteristicVector(bits)}) + } + return Alphabet{charset: charsets} +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go new file mode 100644 index 0000000000..e82a780a52 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go @@ -0,0 +1,250 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "fmt" + "math" +) + +const SinkState = uint32(0) + +type DFA struct { + transitions [][256]uint32 + distances []Distance + initState int + ed uint8 +} + +/// Returns the initial state +func (d *DFA) initialState() int { + return d.initState +} + +/// Returns the Levenshtein distance associated to the +/// current state. +func (d *DFA) distance(stateId int) Distance { + return d.distances[stateId] +} + +/// Returns the number of states in the `DFA`. +func (d *DFA) numStates() int { + return len(d.transitions) +} + +/// Returns the destination state reached after consuming a given byte. +func (d *DFA) transition(fromState int, b uint8) int { + return int(d.transitions[fromState][b]) +} + +func (d *DFA) eval(bytes []uint8) Distance { + state := d.initialState() + + for _, b := range bytes { + state = d.transition(state, b) + } + + return d.distance(state) +} + +func (d *DFA) Start() int { + return int(d.initialState()) +} + +func (d *DFA) IsMatch(state int) bool { + if _, ok := d.distance(state).(Exact); ok { + return true + } + return false +} + +func (d *DFA) CanMatch(state int) bool { + return state > 0 && state < d.numStates() +} + +func (d *DFA) Accept(state int, b byte) int { + return int(d.transition(state, b)) +} + +// WillAlwaysMatch returns if the specified state will always end in a +// matching state. +func (d *DFA) WillAlwaysMatch(state int) bool { + return false +} + +func fill(dest []uint32, val uint32) { + for i := range dest { + dest[i] = val + } +} + +func fillTransitions(dest *[256]uint32, val uint32) { + for i := range dest { + dest[i] = val + } +} + +type Utf8DFAStateBuilder struct { + dfaBuilder *Utf8DFABuilder + stateID uint32 + defaultSuccessor []uint32 +} + +func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8, + toStateID uint32) { + sb.dfaBuilder.transitions[fromStateID][b] = toStateID +} + +func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) { + fromStateID := sb.stateID + chars := []byte(string(in)) + lastByte := chars[len(chars)-1] + + for i, ch := range chars[:len(chars)-1] { + remNumBytes := len(chars) - i - 1 + defaultSuccessor := sb.defaultSuccessor[remNumBytes] + intermediateStateID := sb.dfaBuilder.transitions[fromStateID][ch] + + if intermediateStateID == defaultSuccessor { + intermediateStateID = sb.dfaBuilder.allocate() + fillTransitions(&sb.dfaBuilder.transitions[intermediateStateID], + sb.defaultSuccessor[remNumBytes-1]) + } + + sb.addTransitionID(fromStateID, ch, intermediateStateID) + fromStateID = intermediateStateID + } + + toStateIDDecoded := sb.dfaBuilder.getOrAllocate(original(toStateID)) + sb.addTransitionID(fromStateID, lastByte, toStateIDDecoded) +} + +type Utf8StateId uint32 + +func original(stateId uint32) Utf8StateId { + return predecessor(stateId, 0) +} + +func predecessor(stateId uint32, numSteps uint8) Utf8StateId { + return Utf8StateId(stateId*4 + uint32(numSteps)) +} + +// Utf8DFABuilder makes it possible to define a DFA +// that takes unicode character, and build a `DFA` +// that operates on utf-8 encoded +type Utf8DFABuilder struct { + index []uint32 + distances []Distance + transitions [][256]uint32 + initialState uint32 + numStates uint32 + maxNumStates uint32 +} + +func withMaxStates(maxStates uint32) *Utf8DFABuilder { + rv := &Utf8DFABuilder{ + index: make([]uint32, maxStates*2+100), + distances: make([]Distance, 0, maxStates), + transitions: make([][256]uint32, 0, maxStates), + maxNumStates: maxStates, + } + + for i := range rv.index { + rv.index[i] = math.MaxUint32 + } + + return rv +} + +func (dfab *Utf8DFABuilder) allocate() uint32 { + newState := dfab.numStates + dfab.numStates++ + + dfab.distances = append(dfab.distances, Atleast{d: 255}) + dfab.transitions = append(dfab.transitions, [256]uint32{}) + + return newState +} + +func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 { + if int(state) >= cap(dfab.index) { + cloneIndex := make([]uint32, int(state)*2) + copy(cloneIndex, dfab.index) + dfab.index = cloneIndex + } + if dfab.index[state] != math.MaxUint32 { + return dfab.index[state] + } + + nstate := dfab.allocate() + dfab.index[state] = nstate + + return nstate +} + +func (dfab *Utf8DFABuilder) setInitialState(iState uint32) { + decodedID := dfab.getOrAllocate(original(iState)) + dfab.initialState = decodedID +} + +func (dfab *Utf8DFABuilder) build(ed uint8) *DFA { + return &DFA{ + transitions: dfab.transitions, + distances: dfab.distances, + initState: int(dfab.initialState), + ed: ed, + } +} + +func (dfab *Utf8DFABuilder) addState(state, default_suc_orig uint32, + distance Distance) (*Utf8DFAStateBuilder, error) { + if state > dfab.maxNumStates { + return nil, fmt.Errorf("State id is larger than maxNumStates") + } + + stateID := dfab.getOrAllocate(original(state)) + dfab.distances[stateID] = distance + + defaultSuccID := dfab.getOrAllocate(original(default_suc_orig)) + // creates a chain of states of predecessors of `default_suc_orig`. + // Accepting k-bytes (whatever the bytes are) from `predecessor_states[k-1]` + // leads to the `default_suc_orig` state. + predecessorStates := []uint32{defaultSuccID, + defaultSuccID, + defaultSuccID, + defaultSuccID} + + for numBytes := uint8(1); numBytes < 4; numBytes++ { + predecessorState := predecessor(default_suc_orig, numBytes) + predecessorStateID := dfab.getOrAllocate(predecessorState) + predecessorStates[numBytes] = predecessorStateID + succ := predecessorStates[numBytes-1] + fillTransitions(&dfab.transitions[predecessorStateID], succ) + } + + // 1-byte encoded chars. + fill(dfab.transitions[stateID][0:192], predecessorStates[0]) + // 2-bytes encoded chars. + fill(dfab.transitions[stateID][192:224], predecessorStates[1]) + // 3-bytes encoded chars. + fill(dfab.transitions[stateID][224:240], predecessorStates[2]) + // 4-bytes encoded chars. + fill(dfab.transitions[stateID][240:256], predecessorStates[3]) + + return &Utf8DFAStateBuilder{ + dfaBuilder: dfab, + stateID: stateID, + defaultSuccessor: predecessorStates}, nil +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go new file mode 100644 index 0000000000..1ca0aaa65b --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go @@ -0,0 +1,64 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import "fmt" + +// StateLimit is the maximum number of states allowed +const StateLimit = 10000 + +// ErrTooManyStates is returned if you attempt to build a Levenshtein +// automaton which requires too many states. +var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states", + StateLimit) + +// LevenshteinAutomatonBuilder wraps a precomputed +// datastructure that allows to produce small (but not minimal) DFA. +type LevenshteinAutomatonBuilder struct { + pDfa *ParametricDFA +} + +// NewLevenshteinAutomatonBuilder creates a +// reusable, threadsafe Levenshtein automaton builder. +// `maxDistance` - maximum distance considered by the automaton. +// `transposition` - assign a distance of 1 for transposition +// +// Building this automaton builder is computationally intensive. +// While it takes only a few milliseconds for `d=2`, it grows +// exponentially with `d`. It is only reasonable to `d <= 5`. +func NewLevenshteinAutomatonBuilder(maxDistance uint8, + transposition bool) (*LevenshteinAutomatonBuilder, error) { + lnfa := newLevenshtein(maxDistance, transposition) + + pdfa, err := fromNfa(lnfa) + if err != nil { + return nil, err + } + + return &LevenshteinAutomatonBuilder{pDfa: pdfa}, nil +} + +// BuildDfa builds the levenshtein automaton for serving +// queries with a given edit distance. +func (lab *LevenshteinAutomatonBuilder) BuildDfa(query string, + fuzziness uint8) (*DFA, error) { + return lab.pDfa.buildDfa(query, fuzziness, false) +} + +// MaxDistance returns the MaxEdit distance supported by the +// LevenshteinAutomatonBuilder builder. +func (lab *LevenshteinAutomatonBuilder) MaxDistance() uint8 { + return lab.pDfa.maxDistance +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go new file mode 100644 index 0000000000..bed9b99d56 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go @@ -0,0 +1,292 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "math" + "sort" +) + +/// Levenshtein Distance computed by a Levenshtein Automaton. +/// +/// Levenshtein automata can only compute the exact Levenshtein distance +/// up to a given `max_distance`. +/// +/// Over this distance, the automaton will invariably +/// return `Distance::AtLeast(max_distance + 1)`. +type Distance interface { + distance() uint8 +} + +type Exact struct { + d uint8 +} + +func (e Exact) distance() uint8 { + return e.d +} + +type Atleast struct { + d uint8 +} + +func (a Atleast) distance() uint8 { + return a.d +} + +func characteristicVector(query []rune, c rune) uint64 { + chi := uint64(0) + for i := 0; i < len(query); i++ { + if query[i] == c { + chi |= 1 << uint64(i) + } + } + return chi +} + +type NFAState struct { + Offset uint32 + Distance uint8 + InTranspose bool +} + +type NFAStates []NFAState + +func (ns NFAStates) Len() int { + return len(ns) +} + +func (ns NFAStates) Less(i, j int) bool { + if ns[i].Offset != ns[j].Offset { + return ns[i].Offset < ns[j].Offset + } + + if ns[i].Distance != ns[j].Distance { + return ns[i].Distance < ns[j].Distance + } + + return !ns[i].InTranspose && ns[j].InTranspose +} + +func (ns NFAStates) Swap(i, j int) { + ns[i], ns[j] = ns[j], ns[i] +} + +func (ns *NFAState) imply(other NFAState) bool { + transposeImply := ns.InTranspose + if !other.InTranspose { + transposeImply = !other.InTranspose + } + + deltaOffset := ns.Offset - other.Offset + if ns.Offset < other.Offset { + deltaOffset = other.Offset - ns.Offset + } + + if transposeImply { + return uint32(other.Distance) >= (uint32(ns.Distance) + deltaOffset) + } + + return uint32(other.Distance) > (uint32(ns.Distance) + deltaOffset) +} + +type MultiState struct { + states []NFAState +} + +func (ms *MultiState) States() []NFAState { + return ms.states +} + +func (ms *MultiState) Clear() { + ms.states = ms.states[:0] +} + +func newMultiState() *MultiState { + return &MultiState{states: make([]NFAState, 0)} +} + +func (ms *MultiState) normalize() uint32 { + minOffset := uint32(math.MaxUint32) + + for _, s := range ms.states { + if s.Offset < minOffset { + minOffset = s.Offset + } + } + if minOffset == uint32(math.MaxUint32) { + minOffset = 0 + } + + for i := 0; i < len(ms.states); i++ { + ms.states[i].Offset -= minOffset + } + + sort.Sort(NFAStates(ms.states)) + + return minOffset +} + +func (ms *MultiState) addStates(nState NFAState) { + + for _, s := range ms.states { + if s.imply(nState) { + return + } + } + + i := 0 + for i < len(ms.states) { + if nState.imply(ms.states[i]) { + ms.states = append(ms.states[:i], ms.states[i+1:]...) + } else { + i++ + } + } + ms.states = append(ms.states, nState) + +} + +func extractBit(bitset uint64, pos uint8) bool { + shift := bitset >> pos + bit := shift & 1 + return bit == uint64(1) +} + +func dist(left, right uint32) uint32 { + if left > right { + return left - right + } + return right - left +} + +type LevenshteinNFA struct { + mDistance uint8 + damerau bool +} + +func newLevenshtein(maxD uint8, transposition bool) *LevenshteinNFA { + return &LevenshteinNFA{mDistance: maxD, + damerau: transposition, + } +} + +func (la *LevenshteinNFA) maxDistance() uint8 { + return la.mDistance +} + +func (la *LevenshteinNFA) msDiameter() uint8 { + return 2*la.mDistance + 1 +} + +func (la *LevenshteinNFA) initialStates() *MultiState { + ms := MultiState{} + nfaState := NFAState{} + ms.addStates(nfaState) + return &ms +} + +func (la *LevenshteinNFA) multistateDistance(ms *MultiState, + queryLen uint32) Distance { + minDistance := Atleast{d: la.mDistance + 1} + for _, s := range ms.states { + t := s.Distance + uint8(dist(queryLen, s.Offset)) + if t <= uint8(la.mDistance) { + if minDistance.distance() > t { + minDistance.d = t + } + } + } + + if minDistance.distance() == la.mDistance+1 { + return Atleast{d: la.mDistance + 1} + } + + return minDistance +} + +func (la *LevenshteinNFA) simpleTransition(state NFAState, + symbol uint64, ms *MultiState) { + + if state.Distance < la.mDistance { + // insertion + ms.addStates(NFAState{Offset: state.Offset, + Distance: state.Distance + 1, + InTranspose: false}) + + // substitution + ms.addStates(NFAState{Offset: state.Offset + 1, + Distance: state.Distance + 1, + InTranspose: false}) + + n := la.mDistance + 1 - state.Distance + for d := uint8(1); d < n; d++ { + if extractBit(symbol, d) { + // for d > 0, as many deletion and character match + ms.addStates(NFAState{Offset: state.Offset + 1 + uint32(d), + Distance: state.Distance + d, + InTranspose: false}) + } + } + + if la.damerau && extractBit(symbol, 1) { + ms.addStates(NFAState{ + Offset: state.Offset, + Distance: state.Distance + 1, + InTranspose: true}) + } + + } + + if extractBit(symbol, 0) { + ms.addStates(NFAState{Offset: state.Offset + 1, + Distance: state.Distance, + InTranspose: false}) + } + + if state.InTranspose && extractBit(symbol, 0) { + ms.addStates(NFAState{Offset: state.Offset + 2, + Distance: state.Distance, + InTranspose: false}) + } + +} + +func (la *LevenshteinNFA) transition(cState *MultiState, + dState *MultiState, scv uint64) { + dState.Clear() + mask := (uint64(1) << la.msDiameter()) - uint64(1) + + for _, state := range cState.states { + cv := (scv >> state.Offset) & mask + la.simpleTransition(state, cv, dState) + } + + sort.Sort(NFAStates(dState.states)) +} + +func (la *LevenshteinNFA) computeDistance(query, other []rune) Distance { + cState := la.initialStates() + nState := newMultiState() + + for _, i := range other { + nState.Clear() + chi := characteristicVector(query, i) + la.transition(cState, nState, chi) + cState, nState = nState, cState + } + + return la.multistateDistance(cState, uint32(len(query))) +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go new file mode 100644 index 0000000000..ebd9311959 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go @@ -0,0 +1,349 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "crypto/md5" + "encoding/json" + "fmt" + "math" +) + +type ParametricState struct { + shapeID uint32 + offset uint32 +} + +func newParametricState() ParametricState { + return ParametricState{} +} + +func (ps *ParametricState) isDeadEnd() bool { + return ps.shapeID == 0 +} + +type Transition struct { + destShapeID uint32 + deltaOffset uint32 +} + +func (t *Transition) apply(state ParametricState) ParametricState { + ps := ParametricState{ + shapeID: t.destShapeID} + // don't need any offset if we are in the dead state, + // this ensures we have only one dead state. + if t.destShapeID != 0 { + ps.offset = state.offset + t.deltaOffset + } + + return ps +} + +type ParametricStateIndex struct { + stateIndex []uint32 + stateQueue []ParametricState + numOffsets uint32 +} + +func newParametricStateIndex(queryLen, + numParamState uint32) ParametricStateIndex { + numOffsets := queryLen + 1 + if numParamState == 0 { + numParamState = numOffsets + } + maxNumStates := numParamState * numOffsets + psi := ParametricStateIndex{ + stateIndex: make([]uint32, maxNumStates), + stateQueue: make([]ParametricState, 0, 150), + numOffsets: numOffsets, + } + + for i := uint32(0); i < maxNumStates; i++ { + psi.stateIndex[i] = math.MaxUint32 + } + return psi +} + +func (psi *ParametricStateIndex) numStates() int { + return len(psi.stateQueue) +} + +func (psi *ParametricStateIndex) maxNumStates() int { + return len(psi.stateIndex) +} + +func (psi *ParametricStateIndex) get(stateID uint32) ParametricState { + return psi.stateQueue[stateID] +} + +func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 { + bucket := ps.shapeID*psi.numOffsets + ps.offset + if bucket < uint32(len(psi.stateIndex)) && + psi.stateIndex[bucket] != math.MaxUint32 { + return psi.stateIndex[bucket] + } + nState := uint32(len(psi.stateQueue)) + psi.stateQueue = append(psi.stateQueue, ps) + + psi.stateIndex[bucket] = nState + return nState +} + +type ParametricDFA struct { + distance []uint8 + transitions []Transition + maxDistance uint8 + transitionStride uint32 + diameter uint32 +} + +func (pdfa *ParametricDFA) initialState() ParametricState { + return ParametricState{shapeID: 1} +} + +// Returns true iff whatever characters come afterward, +// we will never reach a shorter distance +func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool { + if state.isDeadEnd() { + return true + } + + remOffset := queryLen - state.offset + if remOffset < pdfa.diameter { + stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:] + prefixDistance := stateDistances[remOffset] + if prefixDistance > pdfa.maxDistance { + return false + } + + for _, d := range stateDistances { + if d < prefixDistance { + return false + } + } + return true + } + return false +} + +func (pdfa *ParametricDFA) numStates() int { + return len(pdfa.transitions) / int(pdfa.transitionStride) +} + +func min(x, y uint32) uint32 { + if x < y { + return x + } + return y +} + +func (pdfa *ParametricDFA) transition(state ParametricState, + chi uint32) Transition { + return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi] +} + +func (pdfa *ParametricDFA) getDistance(state ParametricState, + qLen uint32) Distance { + remainingOffset := qLen - state.offset + if state.isDeadEnd() || remainingOffset >= pdfa.diameter { + return Atleast{d: pdfa.maxDistance + 1} + } + dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)] + if dist > pdfa.maxDistance { + return Atleast{d: dist} + } + return Exact{d: dist} +} + +func (pdfa *ParametricDFA) computeDistance(left, right string) Distance { + state := pdfa.initialState() + leftChars := []rune(left) + for _, chr := range []rune(right) { + start := state.offset + stop := min(start+pdfa.diameter, uint32(len(leftChars))) + chi := characteristicVector(leftChars[start:stop], chr) + transition := pdfa.transition(state, uint32(chi)) + state = transition.apply(state) + if state.isDeadEnd() { + return Atleast{d: pdfa.maxDistance + 1} + } + } + return pdfa.getDistance(state, uint32(len(left))) +} + +func (pdfa *ParametricDFA) buildDfa(query string, distance uint8, + prefix bool) (*DFA, error) { + qLen := uint32(len([]rune(query))) + alphabet := queryChars(query) + + psi := newParametricStateIndex(qLen, uint32(pdfa.numStates())) + maxNumStates := psi.maxNumStates() + deadEndStateID := psi.getOrAllocate(newParametricState()) + if deadEndStateID != 0 { + return nil, fmt.Errorf("Invalid dead end state") + } + + initialStateID := psi.getOrAllocate(pdfa.initialState()) + dfaBuilder := withMaxStates(uint32(maxNumStates)) + mask := uint32((1 << pdfa.diameter) - 1) + + var stateID int + for stateID = 0; stateID < StateLimit; stateID++ { + if stateID == psi.numStates() { + break + } + state := psi.get(uint32(stateID)) + if prefix && pdfa.isPrefixSink(state, qLen) { + distance := pdfa.getDistance(state, qLen) + dfaBuilder.addState(uint32(stateID), uint32(stateID), distance) + } else { + transition := pdfa.transition(state, 0) + defSuccessor := transition.apply(state) + defSuccessorID := psi.getOrAllocate(defSuccessor) + distance := pdfa.getDistance(state, qLen) + stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance) + + if err != nil { + return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err) + } + + alphabet.resetNext() + chr, cv, err := alphabet.next() + for err == nil { + chi := cv.shiftAndMask(state.offset, mask) + + transition := pdfa.transition(state, chi) + + destState := transition.apply(state) + + destStateID := psi.getOrAllocate(destState) + + stateBuilder.addTransition(chr, destStateID) + + chr, cv, err = alphabet.next() + } + } + } + + if stateID == StateLimit { + return nil, ErrTooManyStates + } + + dfaBuilder.setInitialState(initialStateID) + return dfaBuilder.build(distance), nil +} + +func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) { + lookUp := newHash() + lookUp.getOrAllocate(*newMultiState()) + initialState := nfa.initialStates() + lookUp.getOrAllocate(*initialState) + + maxDistance := nfa.maxDistance() + msDiameter := nfa.msDiameter() + + numChi := 1 << msDiameter + chiValues := make([]uint64, numChi) + for i := 0; i < numChi; i++ { + chiValues[i] = uint64(i) + } + + transitions := make([]Transition, 0, numChi*int(msDiameter)) + var stateID int + for stateID = 0; stateID < StateLimit; stateID++ { + if stateID == len(lookUp.items) { + break + } + + for _, chi := range chiValues { + destMs := newMultiState() + + ms := lookUp.getFromID(stateID) + + nfa.transition(ms, destMs, chi) + + translation := destMs.normalize() + + destID := lookUp.getOrAllocate(*destMs) + + transitions = append(transitions, Transition{ + destShapeID: uint32(destID), + deltaOffset: translation, + }) + } + } + + if stateID == StateLimit { + return nil, ErrTooManyStates + } + + ns := len(lookUp.items) + diameter := int(msDiameter) + + distances := make([]uint8, 0, diameter*ns) + for stateID := 0; stateID < ns; stateID++ { + ms := lookUp.getFromID(stateID) + for offset := 0; offset < diameter; offset++ { + dist := nfa.multistateDistance(ms, uint32(offset)) + distances = append(distances, dist.distance()) + } + } + + return &ParametricDFA{ + diameter: uint32(msDiameter), + transitions: transitions, + maxDistance: maxDistance, + transitionStride: uint32(numChi), + distance: distances, + }, nil +} + +type hash struct { + index map[[16]byte]int + items []MultiState +} + +func newHash() *hash { + return &hash{ + index: make(map[[16]byte]int, 100), + items: make([]MultiState, 0, 100), + } +} + +func (h *hash) getOrAllocate(m MultiState) int { + size := len(h.items) + var exists bool + var pos int + md5 := getHash(&m) + if pos, exists = h.index[md5]; !exists { + h.index[md5] = size + pos = size + h.items = append(h.items, m) + } + return pos +} + +func (h *hash) getFromID(id int) *MultiState { + return &h.items[id] +} + +func getHash(ms *MultiState) [16]byte { + msBytes := []byte{} + for _, state := range ms.states { + jsonBytes, _ := json.Marshal(&state) + msBytes = append(msBytes, jsonBytes...) + } + return md5.Sum(msBytes) +} |