From 9591185c8f45fefb39e84aa465a943b24ad60a26 Mon Sep 17 00:00:00 2001 From: Mura Li Date: Wed, 27 Nov 2019 17:23:33 +0800 Subject: Upgrade blevesearch to v0.8.1 (#9177) For #1441 https://github.com/blevesearch/bleve/commit/a91b427b59b893f112021841ba7370d285f8426f --- vendor/github.com/couchbase/vellum/fst_iterator.go | 62 ++-- vendor/github.com/couchbase/vellum/go.mod | 10 + vendor/github.com/couchbase/vellum/go.sum | 39 +++ .../couchbase/vellum/levenshtein/LICENSE | 203 ++++++++++++ .../couchbase/vellum/levenshtein/README.md | 33 ++ .../couchbase/vellum/levenshtein/alphabet.go | 125 ++++++++ .../github.com/couchbase/vellum/levenshtein/dfa.go | 250 +++++++++++++++ .../couchbase/vellum/levenshtein/levenshtein.go | 64 ++++ .../vellum/levenshtein/levenshtein_nfa.go | 292 +++++++++++++++++ .../couchbase/vellum/levenshtein/parametric_dfa.go | 349 +++++++++++++++++++++ .../couchbase/vellum/levenshtein2/LICENSE | 203 ------------ .../couchbase/vellum/levenshtein2/README.md | 33 -- .../couchbase/vellum/levenshtein2/alphabet.go | 125 -------- .../couchbase/vellum/levenshtein2/dfa.go | 250 --------------- .../couchbase/vellum/levenshtein2/levenshtein.go | 64 ---- .../vellum/levenshtein2/levenshtein_nfa.go | 292 ----------------- .../vellum/levenshtein2/parametric_dfa.go | 349 --------------------- .../github.com/couchbase/vellum/regexp/compile.go | 26 +- 18 files changed, 1425 insertions(+), 1344 deletions(-) create mode 100644 vendor/github.com/couchbase/vellum/go.mod create mode 100644 vendor/github.com/couchbase/vellum/go.sum create mode 100644 vendor/github.com/couchbase/vellum/levenshtein/LICENSE create mode 100644 vendor/github.com/couchbase/vellum/levenshtein/README.md create mode 100644 vendor/github.com/couchbase/vellum/levenshtein/alphabet.go create mode 100644 vendor/github.com/couchbase/vellum/levenshtein/dfa.go create mode 100644 vendor/github.com/couchbase/vellum/levenshtein/levenshtein.go create mode 100644 vendor/github.com/couchbase/vellum/levenshtein/levenshtein_nfa.go create mode 100644 vendor/github.com/couchbase/vellum/levenshtein/parametric_dfa.go delete mode 100644 vendor/github.com/couchbase/vellum/levenshtein2/LICENSE delete mode 100644 vendor/github.com/couchbase/vellum/levenshtein2/README.md delete mode 100644 vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go delete mode 100644 vendor/github.com/couchbase/vellum/levenshtein2/dfa.go delete mode 100644 vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go delete mode 100644 vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go delete mode 100644 vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go (limited to 'vendor/github.com/couchbase') diff --git a/vendor/github.com/couchbase/vellum/fst_iterator.go b/vendor/github.com/couchbase/vellum/fst_iterator.go index eb731395b2..d04ad63fb1 100644 --- a/vendor/github.com/couchbase/vellum/fst_iterator.go +++ b/vendor/github.com/couchbase/vellum/fst_iterator.go @@ -18,7 +18,7 @@ import ( "bytes" ) -// Iterator represents a means of visity key/value pairs in order. +// Iterator represents a means of visiting key/value pairs in order. type Iterator interface { // Current() returns the key/value pair currently pointed to. @@ -186,20 +186,29 @@ func (i *FSTIterator) Next() error { } func (i *FSTIterator) next(lastOffset int) error { - // remember where we started + // remember where we started with keysStack in this next() call i.nextStart = append(i.nextStart[:0], i.keysStack...) nextOffset := lastOffset + 1 + allowCompare := false OUTER: for true { curr := i.statesStack[len(i.statesStack)-1] autCurr := i.autStatesStack[len(i.autStatesStack)-1] - if curr.Final() && i.aut.IsMatch(autCurr) && - bytes.Compare(i.keysStack, i.nextStart) > 0 { - // in final state greater than start key - return nil + if curr.Final() && i.aut.IsMatch(autCurr) && allowCompare { + // check to see if new keystack might have gone too far + if i.endKeyExclusive != nil && + bytes.Compare(i.keysStack, i.endKeyExclusive) >= 0 { + return ErrIteratorDone + } + + cmp := bytes.Compare(i.keysStack, i.nextStart) + if cmp > 0 { + // in final state greater than start key + return nil + } } numTrans := curr.NumTransitions() @@ -207,8 +216,12 @@ OUTER: INNER: for nextOffset < numTrans { t := curr.TransitionAt(nextOffset) + autNext := i.aut.Accept(autCurr, t) if !i.aut.CanMatch(autNext) { + // TODO: potential optimization to skip nextOffset + // forwards more directly to something that the + // automaton likes rather than a linear scan? nextOffset += 1 continue INNER } @@ -234,30 +247,41 @@ OUTER: i.valsStack = append(i.valsStack, v) i.autStatesStack = append(i.autStatesStack, autNext) - // check to see if new keystack might have gone too far - if i.endKeyExclusive != nil && - bytes.Compare(i.keysStack, i.endKeyExclusive) >= 0 { - return ErrIteratorDone - } - nextOffset = 0 + allowCompare = true + continue OUTER } + // no more transitions, so need to backtrack and stack pop if len(i.statesStack) <= 1 { // stack len is 1 (root), can't go back further, we're done break } - // no transitions, and still room to pop - i.statesStack = i.statesStack[:len(i.statesStack)-1] - i.keysStack = i.keysStack[:len(i.keysStack)-1] + // if the top of the stack represents a linear chain of states + // (i.e., a suffix of nodes linked by single transitions), + // then optimize by popping the suffix in one shot without + // going back all the way to the OUTER loop + var popNum int + for j := len(i.statesStack) - 1; j > 0; j-- { + if i.statesStack[j].NumTransitions() != 1 { + popNum = len(i.statesStack) - 1 - j + break + } + } + if popNum < 1 { // always pop at least 1 entry from the stacks + popNum = 1 + } - nextOffset = i.keysPosStack[len(i.keysPosStack)-1] + 1 + nextOffset = i.keysPosStack[len(i.keysPosStack)-popNum] + 1 + allowCompare = false - i.keysPosStack = i.keysPosStack[:len(i.keysPosStack)-1] - i.valsStack = i.valsStack[:len(i.valsStack)-1] - i.autStatesStack = i.autStatesStack[:len(i.autStatesStack)-1] + i.statesStack = i.statesStack[:len(i.statesStack)-popNum] + i.keysStack = i.keysStack[:len(i.keysStack)-popNum] + i.keysPosStack = i.keysPosStack[:len(i.keysPosStack)-popNum] + i.valsStack = i.valsStack[:len(i.valsStack)-popNum] + i.autStatesStack = i.autStatesStack[:len(i.autStatesStack)-popNum] } return ErrIteratorDone diff --git a/vendor/github.com/couchbase/vellum/go.mod b/vendor/github.com/couchbase/vellum/go.mod new file mode 100644 index 0000000000..0e304159d4 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/go.mod @@ -0,0 +1,10 @@ +module github.com/couchbase/vellum + +go 1.12 + +require ( + github.com/edsrzf/mmap-go v1.0.0 + github.com/spf13/cobra v0.0.5 + github.com/willf/bitset v1.1.10 + golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a // indirect +) diff --git a/vendor/github.com/couchbase/vellum/go.sum b/vendor/github.com/couchbase/vellum/go.sum new file mode 100644 index 0000000000..f14998530d --- /dev/null +++ b/vendor/github.com/couchbase/vellum/go.sum @@ -0,0 +1,39 @@ +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= +github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= +github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk= +github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= +github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/edsrzf/mmap-go v1.0.0 h1:CEBF7HpRnUCSJgGUb5h1Gm7e3VkmVDrR8lvWVLtrOFw= +github.com/edsrzf/mmap-go v1.0.0/go.mod h1:YO35OhQPt3KJa3ryjFM5Bs14WD66h8eGKpfaBNrHW5M= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM= +github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= +github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= +github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= +github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= +github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= +github.com/spf13/cobra v0.0.5 h1:f0B+LkLX6DtmRH1isoNA9VTtNUK9K8xYd28JNNfOv/s= +github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tLCHU= +github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= +github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg= +github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= +github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= +github.com/willf/bitset v1.1.10 h1:NotGKqX0KwQ72NUzqrjZq5ipPNDQex9lo3WpaS8L2sc= +github.com/willf/bitset v1.1.10/go.mod h1:RjeCKbqT1RxIR/KWY6phxZiaY1IyutSBfGjNPySAYV4= +github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= +golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a h1:aYOabOQFp6Vj6W1F80affTUvO9UxmJRx8K0gsfABByQ= +golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/vendor/github.com/couchbase/vellum/levenshtein/LICENSE b/vendor/github.com/couchbase/vellum/levenshtein/LICENSE new file mode 100644 index 0000000000..6b0b1270ff --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/vendor/github.com/couchbase/vellum/levenshtein/README.md b/vendor/github.com/couchbase/vellum/levenshtein/README.md new file mode 100644 index 0000000000..582b69c77e --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein/README.md @@ -0,0 +1,33 @@ +# levenshtein +levenshtein automaton + +This package makes it fast and simple to build a finite determinic automaton that computes the levenshtein distance from a given string. + +# Sample usage: + +``` +// build a re-usable builder +lb := NewLevenshteinAutomatonBuilder(2, false) + +origTerm := "couchbasefts" +dfa := lb.BuildDfa("couchbases", 2) +ed := dfa.eval([]byte(origTerm)) +if ed.distance() != 2 { + log.Errorf("expected distance 2, actual: %d", ed.distance()) +} + +``` + +This implementation is inspired by [blog post](https://fulmicoton.com/posts/levenshtein/) and is intended to be +a port of original rust implementation: https://github.com/tantivy-search/levenshtein-automata + + +Micro Benchmark Results against the current vellum/levenshtein is as below. + +``` +BenchmarkNewEditDistance1-8 30000 52684 ns/op 89985 B/op 295 allocs/op +BenchmarkOlderEditDistance1-8 10000 132931 ns/op 588892 B/op 363 allocs/op + +BenchmarkNewEditDistance2-8 10000 199127 ns/op 377532 B/op 1019 allocs/op +BenchmarkOlderEditDistance2-8 2000 988109 ns/op 4236609 B/op 1898 allocs/op +``` diff --git a/vendor/github.com/couchbase/vellum/levenshtein/alphabet.go b/vendor/github.com/couchbase/vellum/levenshtein/alphabet.go new file mode 100644 index 0000000000..ec285129ca --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein/alphabet.go @@ -0,0 +1,125 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein + +import ( + "fmt" + "sort" + "unicode/utf8" +) + +type FullCharacteristicVector []uint32 + +func (fcv FullCharacteristicVector) shiftAndMask(offset, mask uint32) uint32 { + bucketID := offset / 32 + align := offset - bucketID*32 + if align == 0 { + return fcv[bucketID] & mask + } + left := fcv[bucketID] >> align + right := fcv[bucketID+1] << (32 - align) + return (left | right) & mask +} + +type tuple struct { + char rune + fcv FullCharacteristicVector +} + +type sortRunes []rune + +func (s sortRunes) Less(i, j int) bool { + return s[i] < s[j] +} + +func (s sortRunes) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +func (s sortRunes) Len() int { + return len(s) +} + +func sortRune(r []rune) []rune { + sort.Sort(sortRunes(r)) + return r +} + +type Alphabet struct { + charset []tuple + index uint32 +} + +func (a *Alphabet) resetNext() { + a.index = 0 +} + +func (a *Alphabet) next() (rune, FullCharacteristicVector, error) { + if int(a.index) >= len(a.charset) { + return 0, nil, fmt.Errorf("eof") + } + + rv := a.charset[a.index] + a.index++ + return rv.char, rv.fcv, nil +} + +func dedupe(in string) string { + lookUp := make(map[rune]struct{}, len(in)) + var rv string + for len(in) > 0 { + r, size := utf8.DecodeRuneInString(in) + in = in[size:] + if _, ok := lookUp[r]; !ok { + rv += string(r) + lookUp[r] = struct{}{} + } + } + return rv +} + +func queryChars(qChars string) Alphabet { + chars := dedupe(qChars) + inChars := sortRune([]rune(chars)) + charsets := make([]tuple, 0, len(inChars)) + + for _, c := range inChars { + tempChars := qChars + var bits []uint32 + for len(tempChars) > 0 { + var chunk string + if len(tempChars) > 32 { + chunk = tempChars[0:32] + tempChars = tempChars[32:] + } else { + chunk = tempChars + tempChars = tempChars[:0] + } + + chunkBits := uint32(0) + bit := uint32(1) + for _, chr := range chunk { + if chr == c { + chunkBits |= bit + } + bit <<= 1 + } + bits = append(bits, chunkBits) + } + bits = append(bits, 0) + charsets = append(charsets, tuple{char: c, fcv: FullCharacteristicVector(bits)}) + } + return Alphabet{charset: charsets} +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein/dfa.go b/vendor/github.com/couchbase/vellum/levenshtein/dfa.go new file mode 100644 index 0000000000..d0e43cac24 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein/dfa.go @@ -0,0 +1,250 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein + +import ( + "fmt" + "math" +) + +const SinkState = uint32(0) + +type DFA struct { + transitions [][256]uint32 + distances []Distance + initState int + ed uint8 +} + +/// Returns the initial state +func (d *DFA) initialState() int { + return d.initState +} + +/// Returns the Levenshtein distance associated to the +/// current state. +func (d *DFA) distance(stateId int) Distance { + return d.distances[stateId] +} + +/// Returns the number of states in the `DFA`. +func (d *DFA) numStates() int { + return len(d.transitions) +} + +/// Returns the destination state reached after consuming a given byte. +func (d *DFA) transition(fromState int, b uint8) int { + return int(d.transitions[fromState][b]) +} + +func (d *DFA) eval(bytes []uint8) Distance { + state := d.initialState() + + for _, b := range bytes { + state = d.transition(state, b) + } + + return d.distance(state) +} + +func (d *DFA) Start() int { + return int(d.initialState()) +} + +func (d *DFA) IsMatch(state int) bool { + if _, ok := d.distance(state).(Exact); ok { + return true + } + return false +} + +func (d *DFA) CanMatch(state int) bool { + return state > 0 && state < d.numStates() +} + +func (d *DFA) Accept(state int, b byte) int { + return int(d.transition(state, b)) +} + +// WillAlwaysMatch returns if the specified state will always end in a +// matching state. +func (d *DFA) WillAlwaysMatch(state int) bool { + return false +} + +func fill(dest []uint32, val uint32) { + for i := range dest { + dest[i] = val + } +} + +func fillTransitions(dest *[256]uint32, val uint32) { + for i := range dest { + dest[i] = val + } +} + +type Utf8DFAStateBuilder struct { + dfaBuilder *Utf8DFABuilder + stateID uint32 + defaultSuccessor []uint32 +} + +func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8, + toStateID uint32) { + sb.dfaBuilder.transitions[fromStateID][b] = toStateID +} + +func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) { + fromStateID := sb.stateID + chars := []byte(string(in)) + lastByte := chars[len(chars)-1] + + for i, ch := range chars[:len(chars)-1] { + remNumBytes := len(chars) - i - 1 + defaultSuccessor := sb.defaultSuccessor[remNumBytes] + intermediateStateID := sb.dfaBuilder.transitions[fromStateID][ch] + + if intermediateStateID == defaultSuccessor { + intermediateStateID = sb.dfaBuilder.allocate() + fillTransitions(&sb.dfaBuilder.transitions[intermediateStateID], + sb.defaultSuccessor[remNumBytes-1]) + } + + sb.addTransitionID(fromStateID, ch, intermediateStateID) + fromStateID = intermediateStateID + } + + toStateIDDecoded := sb.dfaBuilder.getOrAllocate(original(toStateID)) + sb.addTransitionID(fromStateID, lastByte, toStateIDDecoded) +} + +type Utf8StateId uint32 + +func original(stateId uint32) Utf8StateId { + return predecessor(stateId, 0) +} + +func predecessor(stateId uint32, numSteps uint8) Utf8StateId { + return Utf8StateId(stateId*4 + uint32(numSteps)) +} + +// Utf8DFABuilder makes it possible to define a DFA +// that takes unicode character, and build a `DFA` +// that operates on utf-8 encoded +type Utf8DFABuilder struct { + index []uint32 + distances []Distance + transitions [][256]uint32 + initialState uint32 + numStates uint32 + maxNumStates uint32 +} + +func withMaxStates(maxStates uint32) *Utf8DFABuilder { + rv := &Utf8DFABuilder{ + index: make([]uint32, maxStates*2+100), + distances: make([]Distance, 0, maxStates), + transitions: make([][256]uint32, 0, maxStates), + maxNumStates: maxStates, + } + + for i := range rv.index { + rv.index[i] = math.MaxUint32 + } + + return rv +} + +func (dfab *Utf8DFABuilder) allocate() uint32 { + newState := dfab.numStates + dfab.numStates++ + + dfab.distances = append(dfab.distances, Atleast{d: 255}) + dfab.transitions = append(dfab.transitions, [256]uint32{}) + + return newState +} + +func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 { + if int(state) >= cap(dfab.index) { + cloneIndex := make([]uint32, int(state)*2) + copy(cloneIndex, dfab.index) + dfab.index = cloneIndex + } + if dfab.index[state] != math.MaxUint32 { + return dfab.index[state] + } + + nstate := dfab.allocate() + dfab.index[state] = nstate + + return nstate +} + +func (dfab *Utf8DFABuilder) setInitialState(iState uint32) { + decodedID := dfab.getOrAllocate(original(iState)) + dfab.initialState = decodedID +} + +func (dfab *Utf8DFABuilder) build(ed uint8) *DFA { + return &DFA{ + transitions: dfab.transitions, + distances: dfab.distances, + initState: int(dfab.initialState), + ed: ed, + } +} + +func (dfab *Utf8DFABuilder) addState(state, default_suc_orig uint32, + distance Distance) (*Utf8DFAStateBuilder, error) { + if state > dfab.maxNumStates { + return nil, fmt.Errorf("State id is larger than maxNumStates") + } + + stateID := dfab.getOrAllocate(original(state)) + dfab.distances[stateID] = distance + + defaultSuccID := dfab.getOrAllocate(original(default_suc_orig)) + // creates a chain of states of predecessors of `default_suc_orig`. + // Accepting k-bytes (whatever the bytes are) from `predecessor_states[k-1]` + // leads to the `default_suc_orig` state. + predecessorStates := []uint32{defaultSuccID, + defaultSuccID, + defaultSuccID, + defaultSuccID} + + for numBytes := uint8(1); numBytes < 4; numBytes++ { + predecessorState := predecessor(default_suc_orig, numBytes) + predecessorStateID := dfab.getOrAllocate(predecessorState) + predecessorStates[numBytes] = predecessorStateID + succ := predecessorStates[numBytes-1] + fillTransitions(&dfab.transitions[predecessorStateID], succ) + } + + // 1-byte encoded chars. + fill(dfab.transitions[stateID][0:192], predecessorStates[0]) + // 2-bytes encoded chars. + fill(dfab.transitions[stateID][192:224], predecessorStates[1]) + // 3-bytes encoded chars. + fill(dfab.transitions[stateID][224:240], predecessorStates[2]) + // 4-bytes encoded chars. + fill(dfab.transitions[stateID][240:256], predecessorStates[3]) + + return &Utf8DFAStateBuilder{ + dfaBuilder: dfab, + stateID: stateID, + defaultSuccessor: predecessorStates}, nil +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein/levenshtein.go b/vendor/github.com/couchbase/vellum/levenshtein/levenshtein.go new file mode 100644 index 0000000000..aa652df844 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein/levenshtein.go @@ -0,0 +1,64 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein + +import "fmt" + +// StateLimit is the maximum number of states allowed +const StateLimit = 10000 + +// ErrTooManyStates is returned if you attempt to build a Levenshtein +// automaton which requires too many states. +var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states", + StateLimit) + +// LevenshteinAutomatonBuilder wraps a precomputed +// datastructure that allows to produce small (but not minimal) DFA. +type LevenshteinAutomatonBuilder struct { + pDfa *ParametricDFA +} + +// NewLevenshteinAutomatonBuilder creates a +// reusable, threadsafe Levenshtein automaton builder. +// `maxDistance` - maximum distance considered by the automaton. +// `transposition` - assign a distance of 1 for transposition +// +// Building this automaton builder is computationally intensive. +// While it takes only a few milliseconds for `d=2`, it grows +// exponentially with `d`. It is only reasonable to `d <= 5`. +func NewLevenshteinAutomatonBuilder(maxDistance uint8, + transposition bool) (*LevenshteinAutomatonBuilder, error) { + lnfa := newLevenshtein(maxDistance, transposition) + + pdfa, err := fromNfa(lnfa) + if err != nil { + return nil, err + } + + return &LevenshteinAutomatonBuilder{pDfa: pdfa}, nil +} + +// BuildDfa builds the levenshtein automaton for serving +// queries with a given edit distance. +func (lab *LevenshteinAutomatonBuilder) BuildDfa(query string, + fuzziness uint8) (*DFA, error) { + return lab.pDfa.buildDfa(query, fuzziness, false) +} + +// MaxDistance returns the MaxEdit distance supported by the +// LevenshteinAutomatonBuilder builder. +func (lab *LevenshteinAutomatonBuilder) MaxDistance() uint8 { + return lab.pDfa.maxDistance +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein/levenshtein_nfa.go b/vendor/github.com/couchbase/vellum/levenshtein/levenshtein_nfa.go new file mode 100644 index 0000000000..68db5d191c --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein/levenshtein_nfa.go @@ -0,0 +1,292 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein + +import ( + "math" + "sort" +) + +/// Levenshtein Distance computed by a Levenshtein Automaton. +/// +/// Levenshtein automata can only compute the exact Levenshtein distance +/// up to a given `max_distance`. +/// +/// Over this distance, the automaton will invariably +/// return `Distance::AtLeast(max_distance + 1)`. +type Distance interface { + distance() uint8 +} + +type Exact struct { + d uint8 +} + +func (e Exact) distance() uint8 { + return e.d +} + +type Atleast struct { + d uint8 +} + +func (a Atleast) distance() uint8 { + return a.d +} + +func characteristicVector(query []rune, c rune) uint64 { + chi := uint64(0) + for i := 0; i < len(query); i++ { + if query[i] == c { + chi |= 1 << uint64(i) + } + } + return chi +} + +type NFAState struct { + Offset uint32 + Distance uint8 + InTranspose bool +} + +type NFAStates []NFAState + +func (ns NFAStates) Len() int { + return len(ns) +} + +func (ns NFAStates) Less(i, j int) bool { + if ns[i].Offset != ns[j].Offset { + return ns[i].Offset < ns[j].Offset + } + + if ns[i].Distance != ns[j].Distance { + return ns[i].Distance < ns[j].Distance + } + + return !ns[i].InTranspose && ns[j].InTranspose +} + +func (ns NFAStates) Swap(i, j int) { + ns[i], ns[j] = ns[j], ns[i] +} + +func (ns *NFAState) imply(other NFAState) bool { + transposeImply := ns.InTranspose + if !other.InTranspose { + transposeImply = !other.InTranspose + } + + deltaOffset := ns.Offset - other.Offset + if ns.Offset < other.Offset { + deltaOffset = other.Offset - ns.Offset + } + + if transposeImply { + return uint32(other.Distance) >= (uint32(ns.Distance) + deltaOffset) + } + + return uint32(other.Distance) > (uint32(ns.Distance) + deltaOffset) +} + +type MultiState struct { + states []NFAState +} + +func (ms *MultiState) States() []NFAState { + return ms.states +} + +func (ms *MultiState) Clear() { + ms.states = ms.states[:0] +} + +func newMultiState() *MultiState { + return &MultiState{states: make([]NFAState, 0)} +} + +func (ms *MultiState) normalize() uint32 { + minOffset := uint32(math.MaxUint32) + + for _, s := range ms.states { + if s.Offset < minOffset { + minOffset = s.Offset + } + } + if minOffset == uint32(math.MaxUint32) { + minOffset = 0 + } + + for i := 0; i < len(ms.states); i++ { + ms.states[i].Offset -= minOffset + } + + sort.Sort(NFAStates(ms.states)) + + return minOffset +} + +func (ms *MultiState) addStates(nState NFAState) { + + for _, s := range ms.states { + if s.imply(nState) { + return + } + } + + i := 0 + for i < len(ms.states) { + if nState.imply(ms.states[i]) { + ms.states = append(ms.states[:i], ms.states[i+1:]...) + } else { + i++ + } + } + ms.states = append(ms.states, nState) + +} + +func extractBit(bitset uint64, pos uint8) bool { + shift := bitset >> pos + bit := shift & 1 + return bit == uint64(1) +} + +func dist(left, right uint32) uint32 { + if left > right { + return left - right + } + return right - left +} + +type LevenshteinNFA struct { + mDistance uint8 + damerau bool +} + +func newLevenshtein(maxD uint8, transposition bool) *LevenshteinNFA { + return &LevenshteinNFA{mDistance: maxD, + damerau: transposition, + } +} + +func (la *LevenshteinNFA) maxDistance() uint8 { + return la.mDistance +} + +func (la *LevenshteinNFA) msDiameter() uint8 { + return 2*la.mDistance + 1 +} + +func (la *LevenshteinNFA) initialStates() *MultiState { + ms := MultiState{} + nfaState := NFAState{} + ms.addStates(nfaState) + return &ms +} + +func (la *LevenshteinNFA) multistateDistance(ms *MultiState, + queryLen uint32) Distance { + minDistance := Atleast{d: la.mDistance + 1} + for _, s := range ms.states { + t := s.Distance + uint8(dist(queryLen, s.Offset)) + if t <= uint8(la.mDistance) { + if minDistance.distance() > t { + minDistance.d = t + } + } + } + + if minDistance.distance() == la.mDistance+1 { + return Atleast{d: la.mDistance + 1} + } + + return minDistance +} + +func (la *LevenshteinNFA) simpleTransition(state NFAState, + symbol uint64, ms *MultiState) { + + if state.Distance < la.mDistance { + // insertion + ms.addStates(NFAState{Offset: state.Offset, + Distance: state.Distance + 1, + InTranspose: false}) + + // substitution + ms.addStates(NFAState{Offset: state.Offset + 1, + Distance: state.Distance + 1, + InTranspose: false}) + + n := la.mDistance + 1 - state.Distance + for d := uint8(1); d < n; d++ { + if extractBit(symbol, d) { + // for d > 0, as many deletion and character match + ms.addStates(NFAState{Offset: state.Offset + 1 + uint32(d), + Distance: state.Distance + d, + InTranspose: false}) + } + } + + if la.damerau && extractBit(symbol, 1) { + ms.addStates(NFAState{ + Offset: state.Offset, + Distance: state.Distance + 1, + InTranspose: true}) + } + + } + + if extractBit(symbol, 0) { + ms.addStates(NFAState{Offset: state.Offset + 1, + Distance: state.Distance, + InTranspose: false}) + } + + if state.InTranspose && extractBit(symbol, 0) { + ms.addStates(NFAState{Offset: state.Offset + 2, + Distance: state.Distance, + InTranspose: false}) + } + +} + +func (la *LevenshteinNFA) transition(cState *MultiState, + dState *MultiState, scv uint64) { + dState.Clear() + mask := (uint64(1) << la.msDiameter()) - uint64(1) + + for _, state := range cState.states { + cv := (scv >> state.Offset) & mask + la.simpleTransition(state, cv, dState) + } + + sort.Sort(NFAStates(dState.states)) +} + +func (la *LevenshteinNFA) computeDistance(query, other []rune) Distance { + cState := la.initialStates() + nState := newMultiState() + + for _, i := range other { + nState.Clear() + chi := characteristicVector(query, i) + la.transition(cState, nState, chi) + cState, nState = nState, cState + } + + return la.multistateDistance(cState, uint32(len(query))) +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein/parametric_dfa.go b/vendor/github.com/couchbase/vellum/levenshtein/parametric_dfa.go new file mode 100644 index 0000000000..d08e5da639 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein/parametric_dfa.go @@ -0,0 +1,349 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein + +import ( + "crypto/md5" + "encoding/json" + "fmt" + "math" +) + +type ParametricState struct { + shapeID uint32 + offset uint32 +} + +func newParametricState() ParametricState { + return ParametricState{} +} + +func (ps *ParametricState) isDeadEnd() bool { + return ps.shapeID == 0 +} + +type Transition struct { + destShapeID uint32 + deltaOffset uint32 +} + +func (t *Transition) apply(state ParametricState) ParametricState { + ps := ParametricState{ + shapeID: t.destShapeID} + // don't need any offset if we are in the dead state, + // this ensures we have only one dead state. + if t.destShapeID != 0 { + ps.offset = state.offset + t.deltaOffset + } + + return ps +} + +type ParametricStateIndex struct { + stateIndex []uint32 + stateQueue []ParametricState + numOffsets uint32 +} + +func newParametricStateIndex(queryLen, + numParamState uint32) ParametricStateIndex { + numOffsets := queryLen + 1 + if numParamState == 0 { + numParamState = numOffsets + } + maxNumStates := numParamState * numOffsets + psi := ParametricStateIndex{ + stateIndex: make([]uint32, maxNumStates), + stateQueue: make([]ParametricState, 0, 150), + numOffsets: numOffsets, + } + + for i := uint32(0); i < maxNumStates; i++ { + psi.stateIndex[i] = math.MaxUint32 + } + return psi +} + +func (psi *ParametricStateIndex) numStates() int { + return len(psi.stateQueue) +} + +func (psi *ParametricStateIndex) maxNumStates() int { + return len(psi.stateIndex) +} + +func (psi *ParametricStateIndex) get(stateID uint32) ParametricState { + return psi.stateQueue[stateID] +} + +func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 { + bucket := ps.shapeID*psi.numOffsets + ps.offset + if bucket < uint32(len(psi.stateIndex)) && + psi.stateIndex[bucket] != math.MaxUint32 { + return psi.stateIndex[bucket] + } + nState := uint32(len(psi.stateQueue)) + psi.stateQueue = append(psi.stateQueue, ps) + + psi.stateIndex[bucket] = nState + return nState +} + +type ParametricDFA struct { + distance []uint8 + transitions []Transition + maxDistance uint8 + transitionStride uint32 + diameter uint32 +} + +func (pdfa *ParametricDFA) initialState() ParametricState { + return ParametricState{shapeID: 1} +} + +// Returns true iff whatever characters come afterward, +// we will never reach a shorter distance +func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool { + if state.isDeadEnd() { + return true + } + + remOffset := queryLen - state.offset + if remOffset < pdfa.diameter { + stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:] + prefixDistance := stateDistances[remOffset] + if prefixDistance > pdfa.maxDistance { + return false + } + + for _, d := range stateDistances { + if d < prefixDistance { + return false + } + } + return true + } + return false +} + +func (pdfa *ParametricDFA) numStates() int { + return len(pdfa.transitions) / int(pdfa.transitionStride) +} + +func min(x, y uint32) uint32 { + if x < y { + return x + } + return y +} + +func (pdfa *ParametricDFA) transition(state ParametricState, + chi uint32) Transition { + return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi] +} + +func (pdfa *ParametricDFA) getDistance(state ParametricState, + qLen uint32) Distance { + remainingOffset := qLen - state.offset + if state.isDeadEnd() || remainingOffset >= pdfa.diameter { + return Atleast{d: pdfa.maxDistance + 1} + } + dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)] + if dist > pdfa.maxDistance { + return Atleast{d: dist} + } + return Exact{d: dist} +} + +func (pdfa *ParametricDFA) computeDistance(left, right string) Distance { + state := pdfa.initialState() + leftChars := []rune(left) + for _, chr := range []rune(right) { + start := state.offset + stop := min(start+pdfa.diameter, uint32(len(leftChars))) + chi := characteristicVector(leftChars[start:stop], chr) + transition := pdfa.transition(state, uint32(chi)) + state = transition.apply(state) + if state.isDeadEnd() { + return Atleast{d: pdfa.maxDistance + 1} + } + } + return pdfa.getDistance(state, uint32(len(left))) +} + +func (pdfa *ParametricDFA) buildDfa(query string, distance uint8, + prefix bool) (*DFA, error) { + qLen := uint32(len([]rune(query))) + alphabet := queryChars(query) + + psi := newParametricStateIndex(qLen, uint32(pdfa.numStates())) + maxNumStates := psi.maxNumStates() + deadEndStateID := psi.getOrAllocate(newParametricState()) + if deadEndStateID != 0 { + return nil, fmt.Errorf("Invalid dead end state") + } + + initialStateID := psi.getOrAllocate(pdfa.initialState()) + dfaBuilder := withMaxStates(uint32(maxNumStates)) + mask := uint32((1 << pdfa.diameter) - 1) + + var stateID int + for stateID = 0; stateID < StateLimit; stateID++ { + if stateID == psi.numStates() { + break + } + state := psi.get(uint32(stateID)) + if prefix && pdfa.isPrefixSink(state, qLen) { + distance := pdfa.getDistance(state, qLen) + dfaBuilder.addState(uint32(stateID), uint32(stateID), distance) + } else { + transition := pdfa.transition(state, 0) + defSuccessor := transition.apply(state) + defSuccessorID := psi.getOrAllocate(defSuccessor) + distance := pdfa.getDistance(state, qLen) + stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance) + + if err != nil { + return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err) + } + + alphabet.resetNext() + chr, cv, err := alphabet.next() + for err == nil { + chi := cv.shiftAndMask(state.offset, mask) + + transition := pdfa.transition(state, chi) + + destState := transition.apply(state) + + destStateID := psi.getOrAllocate(destState) + + stateBuilder.addTransition(chr, destStateID) + + chr, cv, err = alphabet.next() + } + } + } + + if stateID == StateLimit { + return nil, ErrTooManyStates + } + + dfaBuilder.setInitialState(initialStateID) + return dfaBuilder.build(distance), nil +} + +func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) { + lookUp := newHash() + lookUp.getOrAllocate(*newMultiState()) + initialState := nfa.initialStates() + lookUp.getOrAllocate(*initialState) + + maxDistance := nfa.maxDistance() + msDiameter := nfa.msDiameter() + + numChi := 1 << msDiameter + chiValues := make([]uint64, numChi) + for i := 0; i < numChi; i++ { + chiValues[i] = uint64(i) + } + + transitions := make([]Transition, 0, numChi*int(msDiameter)) + var stateID int + for stateID = 0; stateID < StateLimit; stateID++ { + if stateID == len(lookUp.items) { + break + } + + for _, chi := range chiValues { + destMs := newMultiState() + + ms := lookUp.getFromID(stateID) + + nfa.transition(ms, destMs, chi) + + translation := destMs.normalize() + + destID := lookUp.getOrAllocate(*destMs) + + transitions = append(transitions, Transition{ + destShapeID: uint32(destID), + deltaOffset: translation, + }) + } + } + + if stateID == StateLimit { + return nil, ErrTooManyStates + } + + ns := len(lookUp.items) + diameter := int(msDiameter) + + distances := make([]uint8, 0, diameter*ns) + for stateID := 0; stateID < ns; stateID++ { + ms := lookUp.getFromID(stateID) + for offset := 0; offset < diameter; offset++ { + dist := nfa.multistateDistance(ms, uint32(offset)) + distances = append(distances, dist.distance()) + } + } + + return &ParametricDFA{ + diameter: uint32(msDiameter), + transitions: transitions, + maxDistance: maxDistance, + transitionStride: uint32(numChi), + distance: distances, + }, nil +} + +type hash struct { + index map[[16]byte]int + items []MultiState +} + +func newHash() *hash { + return &hash{ + index: make(map[[16]byte]int, 100), + items: make([]MultiState, 0, 100), + } +} + +func (h *hash) getOrAllocate(m MultiState) int { + size := len(h.items) + var exists bool + var pos int + md5 := getHash(&m) + if pos, exists = h.index[md5]; !exists { + h.index[md5] = size + pos = size + h.items = append(h.items, m) + } + return pos +} + +func (h *hash) getFromID(id int) *MultiState { + return &h.items[id] +} + +func getHash(ms *MultiState) [16]byte { + msBytes := []byte{} + for _, state := range ms.states { + jsonBytes, _ := json.Marshal(&state) + msBytes = append(msBytes, jsonBytes...) + } + return md5.Sum(msBytes) +} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE b/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE deleted file mode 100644 index 6b0b1270ff..0000000000 --- a/vendor/github.com/couchbase/vellum/levenshtein2/LICENSE +++ /dev/null @@ -1,203 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/README.md b/vendor/github.com/couchbase/vellum/levenshtein2/README.md deleted file mode 100644 index 582b69c77e..0000000000 --- a/vendor/github.com/couchbase/vellum/levenshtein2/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# levenshtein -levenshtein automaton - -This package makes it fast and simple to build a finite determinic automaton that computes the levenshtein distance from a given string. - -# Sample usage: - -``` -// build a re-usable builder -lb := NewLevenshteinAutomatonBuilder(2, false) - -origTerm := "couchbasefts" -dfa := lb.BuildDfa("couchbases", 2) -ed := dfa.eval([]byte(origTerm)) -if ed.distance() != 2 { - log.Errorf("expected distance 2, actual: %d", ed.distance()) -} - -``` - -This implementation is inspired by [blog post](https://fulmicoton.com/posts/levenshtein/) and is intended to be -a port of original rust implementation: https://github.com/tantivy-search/levenshtein-automata - - -Micro Benchmark Results against the current vellum/levenshtein is as below. - -``` -BenchmarkNewEditDistance1-8 30000 52684 ns/op 89985 B/op 295 allocs/op -BenchmarkOlderEditDistance1-8 10000 132931 ns/op 588892 B/op 363 allocs/op - -BenchmarkNewEditDistance2-8 10000 199127 ns/op 377532 B/op 1019 allocs/op -BenchmarkOlderEditDistance2-8 2000 988109 ns/op 4236609 B/op 1898 allocs/op -``` diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go b/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go deleted file mode 100644 index 4bf64fef2e..0000000000 --- a/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (c) 2018 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package levenshtein2 - -import ( - "fmt" - "sort" - "unicode/utf8" -) - -type FullCharacteristicVector []uint32 - -func (fcv FullCharacteristicVector) shiftAndMask(offset, mask uint32) uint32 { - bucketID := offset / 32 - align := offset - bucketID*32 - if align == 0 { - return fcv[bucketID] & mask - } - left := fcv[bucketID] >> align - right := fcv[bucketID+1] << (32 - align) - return (left | right) & mask -} - -type tuple struct { - char rune - fcv FullCharacteristicVector -} - -type sortRunes []rune - -func (s sortRunes) Less(i, j int) bool { - return s[i] < s[j] -} - -func (s sortRunes) Swap(i, j int) { - s[i], s[j] = s[j], s[i] -} - -func (s sortRunes) Len() int { - return len(s) -} - -func sortRune(r []rune) []rune { - sort.Sort(sortRunes(r)) - return r -} - -type Alphabet struct { - charset []tuple - index uint32 -} - -func (a *Alphabet) resetNext() { - a.index = 0 -} - -func (a *Alphabet) next() (rune, FullCharacteristicVector, error) { - if int(a.index) >= len(a.charset) { - return 0, nil, fmt.Errorf("eof") - } - - rv := a.charset[a.index] - a.index++ - return rv.char, rv.fcv, nil -} - -func dedupe(in string) string { - lookUp := make(map[rune]struct{}, len(in)) - var rv string - for len(in) > 0 { - r, size := utf8.DecodeRuneInString(in) - in = in[size:] - if _, ok := lookUp[r]; !ok { - rv += string(r) - lookUp[r] = struct{}{} - } - } - return rv -} - -func queryChars(qChars string) Alphabet { - chars := dedupe(qChars) - inChars := sortRune([]rune(chars)) - charsets := make([]tuple, 0, len(inChars)) - - for _, c := range inChars { - tempChars := qChars - var bits []uint32 - for len(tempChars) > 0 { - var chunk string - if len(tempChars) > 32 { - chunk = tempChars[0:32] - tempChars = tempChars[32:] - } else { - chunk = tempChars - tempChars = tempChars[:0] - } - - chunkBits := uint32(0) - bit := uint32(1) - for _, chr := range chunk { - if chr == c { - chunkBits |= bit - } - bit <<= 1 - } - bits = append(bits, chunkBits) - } - bits = append(bits, 0) - charsets = append(charsets, tuple{char: c, fcv: FullCharacteristicVector(bits)}) - } - return Alphabet{charset: charsets} -} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go deleted file mode 100644 index e82a780a52..0000000000 --- a/vendor/github.com/couchbase/vellum/levenshtein2/dfa.go +++ /dev/null @@ -1,250 +0,0 @@ -// Copyright (c) 2018 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package levenshtein2 - -import ( - "fmt" - "math" -) - -const SinkState = uint32(0) - -type DFA struct { - transitions [][256]uint32 - distances []Distance - initState int - ed uint8 -} - -/// Returns the initial state -func (d *DFA) initialState() int { - return d.initState -} - -/// Returns the Levenshtein distance associated to the -/// current state. -func (d *DFA) distance(stateId int) Distance { - return d.distances[stateId] -} - -/// Returns the number of states in the `DFA`. -func (d *DFA) numStates() int { - return len(d.transitions) -} - -/// Returns the destination state reached after consuming a given byte. -func (d *DFA) transition(fromState int, b uint8) int { - return int(d.transitions[fromState][b]) -} - -func (d *DFA) eval(bytes []uint8) Distance { - state := d.initialState() - - for _, b := range bytes { - state = d.transition(state, b) - } - - return d.distance(state) -} - -func (d *DFA) Start() int { - return int(d.initialState()) -} - -func (d *DFA) IsMatch(state int) bool { - if _, ok := d.distance(state).(Exact); ok { - return true - } - return false -} - -func (d *DFA) CanMatch(state int) bool { - return state > 0 && state < d.numStates() -} - -func (d *DFA) Accept(state int, b byte) int { - return int(d.transition(state, b)) -} - -// WillAlwaysMatch returns if the specified state will always end in a -// matching state. -func (d *DFA) WillAlwaysMatch(state int) bool { - return false -} - -func fill(dest []uint32, val uint32) { - for i := range dest { - dest[i] = val - } -} - -func fillTransitions(dest *[256]uint32, val uint32) { - for i := range dest { - dest[i] = val - } -} - -type Utf8DFAStateBuilder struct { - dfaBuilder *Utf8DFABuilder - stateID uint32 - defaultSuccessor []uint32 -} - -func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8, - toStateID uint32) { - sb.dfaBuilder.transitions[fromStateID][b] = toStateID -} - -func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) { - fromStateID := sb.stateID - chars := []byte(string(in)) - lastByte := chars[len(chars)-1] - - for i, ch := range chars[:len(chars)-1] { - remNumBytes := len(chars) - i - 1 - defaultSuccessor := sb.defaultSuccessor[remNumBytes] - intermediateStateID := sb.dfaBuilder.transitions[fromStateID][ch] - - if intermediateStateID == defaultSuccessor { - intermediateStateID = sb.dfaBuilder.allocate() - fillTransitions(&sb.dfaBuilder.transitions[intermediateStateID], - sb.defaultSuccessor[remNumBytes-1]) - } - - sb.addTransitionID(fromStateID, ch, intermediateStateID) - fromStateID = intermediateStateID - } - - toStateIDDecoded := sb.dfaBuilder.getOrAllocate(original(toStateID)) - sb.addTransitionID(fromStateID, lastByte, toStateIDDecoded) -} - -type Utf8StateId uint32 - -func original(stateId uint32) Utf8StateId { - return predecessor(stateId, 0) -} - -func predecessor(stateId uint32, numSteps uint8) Utf8StateId { - return Utf8StateId(stateId*4 + uint32(numSteps)) -} - -// Utf8DFABuilder makes it possible to define a DFA -// that takes unicode character, and build a `DFA` -// that operates on utf-8 encoded -type Utf8DFABuilder struct { - index []uint32 - distances []Distance - transitions [][256]uint32 - initialState uint32 - numStates uint32 - maxNumStates uint32 -} - -func withMaxStates(maxStates uint32) *Utf8DFABuilder { - rv := &Utf8DFABuilder{ - index: make([]uint32, maxStates*2+100), - distances: make([]Distance, 0, maxStates), - transitions: make([][256]uint32, 0, maxStates), - maxNumStates: maxStates, - } - - for i := range rv.index { - rv.index[i] = math.MaxUint32 - } - - return rv -} - -func (dfab *Utf8DFABuilder) allocate() uint32 { - newState := dfab.numStates - dfab.numStates++ - - dfab.distances = append(dfab.distances, Atleast{d: 255}) - dfab.transitions = append(dfab.transitions, [256]uint32{}) - - return newState -} - -func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 { - if int(state) >= cap(dfab.index) { - cloneIndex := make([]uint32, int(state)*2) - copy(cloneIndex, dfab.index) - dfab.index = cloneIndex - } - if dfab.index[state] != math.MaxUint32 { - return dfab.index[state] - } - - nstate := dfab.allocate() - dfab.index[state] = nstate - - return nstate -} - -func (dfab *Utf8DFABuilder) setInitialState(iState uint32) { - decodedID := dfab.getOrAllocate(original(iState)) - dfab.initialState = decodedID -} - -func (dfab *Utf8DFABuilder) build(ed uint8) *DFA { - return &DFA{ - transitions: dfab.transitions, - distances: dfab.distances, - initState: int(dfab.initialState), - ed: ed, - } -} - -func (dfab *Utf8DFABuilder) addState(state, default_suc_orig uint32, - distance Distance) (*Utf8DFAStateBuilder, error) { - if state > dfab.maxNumStates { - return nil, fmt.Errorf("State id is larger than maxNumStates") - } - - stateID := dfab.getOrAllocate(original(state)) - dfab.distances[stateID] = distance - - defaultSuccID := dfab.getOrAllocate(original(default_suc_orig)) - // creates a chain of states of predecessors of `default_suc_orig`. - // Accepting k-bytes (whatever the bytes are) from `predecessor_states[k-1]` - // leads to the `default_suc_orig` state. - predecessorStates := []uint32{defaultSuccID, - defaultSuccID, - defaultSuccID, - defaultSuccID} - - for numBytes := uint8(1); numBytes < 4; numBytes++ { - predecessorState := predecessor(default_suc_orig, numBytes) - predecessorStateID := dfab.getOrAllocate(predecessorState) - predecessorStates[numBytes] = predecessorStateID - succ := predecessorStates[numBytes-1] - fillTransitions(&dfab.transitions[predecessorStateID], succ) - } - - // 1-byte encoded chars. - fill(dfab.transitions[stateID][0:192], predecessorStates[0]) - // 2-bytes encoded chars. - fill(dfab.transitions[stateID][192:224], predecessorStates[1]) - // 3-bytes encoded chars. - fill(dfab.transitions[stateID][224:240], predecessorStates[2]) - // 4-bytes encoded chars. - fill(dfab.transitions[stateID][240:256], predecessorStates[3]) - - return &Utf8DFAStateBuilder{ - dfaBuilder: dfab, - stateID: stateID, - defaultSuccessor: predecessorStates}, nil -} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go deleted file mode 100644 index 1ca0aaa65b..0000000000 --- a/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein.go +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2018 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package levenshtein2 - -import "fmt" - -// StateLimit is the maximum number of states allowed -const StateLimit = 10000 - -// ErrTooManyStates is returned if you attempt to build a Levenshtein -// automaton which requires too many states. -var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states", - StateLimit) - -// LevenshteinAutomatonBuilder wraps a precomputed -// datastructure that allows to produce small (but not minimal) DFA. -type LevenshteinAutomatonBuilder struct { - pDfa *ParametricDFA -} - -// NewLevenshteinAutomatonBuilder creates a -// reusable, threadsafe Levenshtein automaton builder. -// `maxDistance` - maximum distance considered by the automaton. -// `transposition` - assign a distance of 1 for transposition -// -// Building this automaton builder is computationally intensive. -// While it takes only a few milliseconds for `d=2`, it grows -// exponentially with `d`. It is only reasonable to `d <= 5`. -func NewLevenshteinAutomatonBuilder(maxDistance uint8, - transposition bool) (*LevenshteinAutomatonBuilder, error) { - lnfa := newLevenshtein(maxDistance, transposition) - - pdfa, err := fromNfa(lnfa) - if err != nil { - return nil, err - } - - return &LevenshteinAutomatonBuilder{pDfa: pdfa}, nil -} - -// BuildDfa builds the levenshtein automaton for serving -// queries with a given edit distance. -func (lab *LevenshteinAutomatonBuilder) BuildDfa(query string, - fuzziness uint8) (*DFA, error) { - return lab.pDfa.buildDfa(query, fuzziness, false) -} - -// MaxDistance returns the MaxEdit distance supported by the -// LevenshteinAutomatonBuilder builder. -func (lab *LevenshteinAutomatonBuilder) MaxDistance() uint8 { - return lab.pDfa.maxDistance -} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go deleted file mode 100644 index bed9b99d56..0000000000 --- a/vendor/github.com/couchbase/vellum/levenshtein2/levenshtein_nfa.go +++ /dev/null @@ -1,292 +0,0 @@ -// Copyright (c) 2018 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package levenshtein2 - -import ( - "math" - "sort" -) - -/// Levenshtein Distance computed by a Levenshtein Automaton. -/// -/// Levenshtein automata can only compute the exact Levenshtein distance -/// up to a given `max_distance`. -/// -/// Over this distance, the automaton will invariably -/// return `Distance::AtLeast(max_distance + 1)`. -type Distance interface { - distance() uint8 -} - -type Exact struct { - d uint8 -} - -func (e Exact) distance() uint8 { - return e.d -} - -type Atleast struct { - d uint8 -} - -func (a Atleast) distance() uint8 { - return a.d -} - -func characteristicVector(query []rune, c rune) uint64 { - chi := uint64(0) - for i := 0; i < len(query); i++ { - if query[i] == c { - chi |= 1 << uint64(i) - } - } - return chi -} - -type NFAState struct { - Offset uint32 - Distance uint8 - InTranspose bool -} - -type NFAStates []NFAState - -func (ns NFAStates) Len() int { - return len(ns) -} - -func (ns NFAStates) Less(i, j int) bool { - if ns[i].Offset != ns[j].Offset { - return ns[i].Offset < ns[j].Offset - } - - if ns[i].Distance != ns[j].Distance { - return ns[i].Distance < ns[j].Distance - } - - return !ns[i].InTranspose && ns[j].InTranspose -} - -func (ns NFAStates) Swap(i, j int) { - ns[i], ns[j] = ns[j], ns[i] -} - -func (ns *NFAState) imply(other NFAState) bool { - transposeImply := ns.InTranspose - if !other.InTranspose { - transposeImply = !other.InTranspose - } - - deltaOffset := ns.Offset - other.Offset - if ns.Offset < other.Offset { - deltaOffset = other.Offset - ns.Offset - } - - if transposeImply { - return uint32(other.Distance) >= (uint32(ns.Distance) + deltaOffset) - } - - return uint32(other.Distance) > (uint32(ns.Distance) + deltaOffset) -} - -type MultiState struct { - states []NFAState -} - -func (ms *MultiState) States() []NFAState { - return ms.states -} - -func (ms *MultiState) Clear() { - ms.states = ms.states[:0] -} - -func newMultiState() *MultiState { - return &MultiState{states: make([]NFAState, 0)} -} - -func (ms *MultiState) normalize() uint32 { - minOffset := uint32(math.MaxUint32) - - for _, s := range ms.states { - if s.Offset < minOffset { - minOffset = s.Offset - } - } - if minOffset == uint32(math.MaxUint32) { - minOffset = 0 - } - - for i := 0; i < len(ms.states); i++ { - ms.states[i].Offset -= minOffset - } - - sort.Sort(NFAStates(ms.states)) - - return minOffset -} - -func (ms *MultiState) addStates(nState NFAState) { - - for _, s := range ms.states { - if s.imply(nState) { - return - } - } - - i := 0 - for i < len(ms.states) { - if nState.imply(ms.states[i]) { - ms.states = append(ms.states[:i], ms.states[i+1:]...) - } else { - i++ - } - } - ms.states = append(ms.states, nState) - -} - -func extractBit(bitset uint64, pos uint8) bool { - shift := bitset >> pos - bit := shift & 1 - return bit == uint64(1) -} - -func dist(left, right uint32) uint32 { - if left > right { - return left - right - } - return right - left -} - -type LevenshteinNFA struct { - mDistance uint8 - damerau bool -} - -func newLevenshtein(maxD uint8, transposition bool) *LevenshteinNFA { - return &LevenshteinNFA{mDistance: maxD, - damerau: transposition, - } -} - -func (la *LevenshteinNFA) maxDistance() uint8 { - return la.mDistance -} - -func (la *LevenshteinNFA) msDiameter() uint8 { - return 2*la.mDistance + 1 -} - -func (la *LevenshteinNFA) initialStates() *MultiState { - ms := MultiState{} - nfaState := NFAState{} - ms.addStates(nfaState) - return &ms -} - -func (la *LevenshteinNFA) multistateDistance(ms *MultiState, - queryLen uint32) Distance { - minDistance := Atleast{d: la.mDistance + 1} - for _, s := range ms.states { - t := s.Distance + uint8(dist(queryLen, s.Offset)) - if t <= uint8(la.mDistance) { - if minDistance.distance() > t { - minDistance.d = t - } - } - } - - if minDistance.distance() == la.mDistance+1 { - return Atleast{d: la.mDistance + 1} - } - - return minDistance -} - -func (la *LevenshteinNFA) simpleTransition(state NFAState, - symbol uint64, ms *MultiState) { - - if state.Distance < la.mDistance { - // insertion - ms.addStates(NFAState{Offset: state.Offset, - Distance: state.Distance + 1, - InTranspose: false}) - - // substitution - ms.addStates(NFAState{Offset: state.Offset + 1, - Distance: state.Distance + 1, - InTranspose: false}) - - n := la.mDistance + 1 - state.Distance - for d := uint8(1); d < n; d++ { - if extractBit(symbol, d) { - // for d > 0, as many deletion and character match - ms.addStates(NFAState{Offset: state.Offset + 1 + uint32(d), - Distance: state.Distance + d, - InTranspose: false}) - } - } - - if la.damerau && extractBit(symbol, 1) { - ms.addStates(NFAState{ - Offset: state.Offset, - Distance: state.Distance + 1, - InTranspose: true}) - } - - } - - if extractBit(symbol, 0) { - ms.addStates(NFAState{Offset: state.Offset + 1, - Distance: state.Distance, - InTranspose: false}) - } - - if state.InTranspose && extractBit(symbol, 0) { - ms.addStates(NFAState{Offset: state.Offset + 2, - Distance: state.Distance, - InTranspose: false}) - } - -} - -func (la *LevenshteinNFA) transition(cState *MultiState, - dState *MultiState, scv uint64) { - dState.Clear() - mask := (uint64(1) << la.msDiameter()) - uint64(1) - - for _, state := range cState.states { - cv := (scv >> state.Offset) & mask - la.simpleTransition(state, cv, dState) - } - - sort.Sort(NFAStates(dState.states)) -} - -func (la *LevenshteinNFA) computeDistance(query, other []rune) Distance { - cState := la.initialStates() - nState := newMultiState() - - for _, i := range other { - nState.Clear() - chi := characteristicVector(query, i) - la.transition(cState, nState, chi) - cState, nState = nState, cState - } - - return la.multistateDistance(cState, uint32(len(query))) -} diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go b/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go deleted file mode 100644 index ebd9311959..0000000000 --- a/vendor/github.com/couchbase/vellum/levenshtein2/parametric_dfa.go +++ /dev/null @@ -1,349 +0,0 @@ -// Copyright (c) 2018 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package levenshtein2 - -import ( - "crypto/md5" - "encoding/json" - "fmt" - "math" -) - -type ParametricState struct { - shapeID uint32 - offset uint32 -} - -func newParametricState() ParametricState { - return ParametricState{} -} - -func (ps *ParametricState) isDeadEnd() bool { - return ps.shapeID == 0 -} - -type Transition struct { - destShapeID uint32 - deltaOffset uint32 -} - -func (t *Transition) apply(state ParametricState) ParametricState { - ps := ParametricState{ - shapeID: t.destShapeID} - // don't need any offset if we are in the dead state, - // this ensures we have only one dead state. - if t.destShapeID != 0 { - ps.offset = state.offset + t.deltaOffset - } - - return ps -} - -type ParametricStateIndex struct { - stateIndex []uint32 - stateQueue []ParametricState - numOffsets uint32 -} - -func newParametricStateIndex(queryLen, - numParamState uint32) ParametricStateIndex { - numOffsets := queryLen + 1 - if numParamState == 0 { - numParamState = numOffsets - } - maxNumStates := numParamState * numOffsets - psi := ParametricStateIndex{ - stateIndex: make([]uint32, maxNumStates), - stateQueue: make([]ParametricState, 0, 150), - numOffsets: numOffsets, - } - - for i := uint32(0); i < maxNumStates; i++ { - psi.stateIndex[i] = math.MaxUint32 - } - return psi -} - -func (psi *ParametricStateIndex) numStates() int { - return len(psi.stateQueue) -} - -func (psi *ParametricStateIndex) maxNumStates() int { - return len(psi.stateIndex) -} - -func (psi *ParametricStateIndex) get(stateID uint32) ParametricState { - return psi.stateQueue[stateID] -} - -func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 { - bucket := ps.shapeID*psi.numOffsets + ps.offset - if bucket < uint32(len(psi.stateIndex)) && - psi.stateIndex[bucket] != math.MaxUint32 { - return psi.stateIndex[bucket] - } - nState := uint32(len(psi.stateQueue)) - psi.stateQueue = append(psi.stateQueue, ps) - - psi.stateIndex[bucket] = nState - return nState -} - -type ParametricDFA struct { - distance []uint8 - transitions []Transition - maxDistance uint8 - transitionStride uint32 - diameter uint32 -} - -func (pdfa *ParametricDFA) initialState() ParametricState { - return ParametricState{shapeID: 1} -} - -// Returns true iff whatever characters come afterward, -// we will never reach a shorter distance -func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool { - if state.isDeadEnd() { - return true - } - - remOffset := queryLen - state.offset - if remOffset < pdfa.diameter { - stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:] - prefixDistance := stateDistances[remOffset] - if prefixDistance > pdfa.maxDistance { - return false - } - - for _, d := range stateDistances { - if d < prefixDistance { - return false - } - } - return true - } - return false -} - -func (pdfa *ParametricDFA) numStates() int { - return len(pdfa.transitions) / int(pdfa.transitionStride) -} - -func min(x, y uint32) uint32 { - if x < y { - return x - } - return y -} - -func (pdfa *ParametricDFA) transition(state ParametricState, - chi uint32) Transition { - return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi] -} - -func (pdfa *ParametricDFA) getDistance(state ParametricState, - qLen uint32) Distance { - remainingOffset := qLen - state.offset - if state.isDeadEnd() || remainingOffset >= pdfa.diameter { - return Atleast{d: pdfa.maxDistance + 1} - } - dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)] - if dist > pdfa.maxDistance { - return Atleast{d: dist} - } - return Exact{d: dist} -} - -func (pdfa *ParametricDFA) computeDistance(left, right string) Distance { - state := pdfa.initialState() - leftChars := []rune(left) - for _, chr := range []rune(right) { - start := state.offset - stop := min(start+pdfa.diameter, uint32(len(leftChars))) - chi := characteristicVector(leftChars[start:stop], chr) - transition := pdfa.transition(state, uint32(chi)) - state = transition.apply(state) - if state.isDeadEnd() { - return Atleast{d: pdfa.maxDistance + 1} - } - } - return pdfa.getDistance(state, uint32(len(left))) -} - -func (pdfa *ParametricDFA) buildDfa(query string, distance uint8, - prefix bool) (*DFA, error) { - qLen := uint32(len([]rune(query))) - alphabet := queryChars(query) - - psi := newParametricStateIndex(qLen, uint32(pdfa.numStates())) - maxNumStates := psi.maxNumStates() - deadEndStateID := psi.getOrAllocate(newParametricState()) - if deadEndStateID != 0 { - return nil, fmt.Errorf("Invalid dead end state") - } - - initialStateID := psi.getOrAllocate(pdfa.initialState()) - dfaBuilder := withMaxStates(uint32(maxNumStates)) - mask := uint32((1 << pdfa.diameter) - 1) - - var stateID int - for stateID = 0; stateID < StateLimit; stateID++ { - if stateID == psi.numStates() { - break - } - state := psi.get(uint32(stateID)) - if prefix && pdfa.isPrefixSink(state, qLen) { - distance := pdfa.getDistance(state, qLen) - dfaBuilder.addState(uint32(stateID), uint32(stateID), distance) - } else { - transition := pdfa.transition(state, 0) - defSuccessor := transition.apply(state) - defSuccessorID := psi.getOrAllocate(defSuccessor) - distance := pdfa.getDistance(state, qLen) - stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance) - - if err != nil { - return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err) - } - - alphabet.resetNext() - chr, cv, err := alphabet.next() - for err == nil { - chi := cv.shiftAndMask(state.offset, mask) - - transition := pdfa.transition(state, chi) - - destState := transition.apply(state) - - destStateID := psi.getOrAllocate(destState) - - stateBuilder.addTransition(chr, destStateID) - - chr, cv, err = alphabet.next() - } - } - } - - if stateID == StateLimit { - return nil, ErrTooManyStates - } - - dfaBuilder.setInitialState(initialStateID) - return dfaBuilder.build(distance), nil -} - -func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) { - lookUp := newHash() - lookUp.getOrAllocate(*newMultiState()) - initialState := nfa.initialStates() - lookUp.getOrAllocate(*initialState) - - maxDistance := nfa.maxDistance() - msDiameter := nfa.msDiameter() - - numChi := 1 << msDiameter - chiValues := make([]uint64, numChi) - for i := 0; i < numChi; i++ { - chiValues[i] = uint64(i) - } - - transitions := make([]Transition, 0, numChi*int(msDiameter)) - var stateID int - for stateID = 0; stateID < StateLimit; stateID++ { - if stateID == len(lookUp.items) { - break - } - - for _, chi := range chiValues { - destMs := newMultiState() - - ms := lookUp.getFromID(stateID) - - nfa.transition(ms, destMs, chi) - - translation := destMs.normalize() - - destID := lookUp.getOrAllocate(*destMs) - - transitions = append(transitions, Transition{ - destShapeID: uint32(destID), - deltaOffset: translation, - }) - } - } - - if stateID == StateLimit { - return nil, ErrTooManyStates - } - - ns := len(lookUp.items) - diameter := int(msDiameter) - - distances := make([]uint8, 0, diameter*ns) - for stateID := 0; stateID < ns; stateID++ { - ms := lookUp.getFromID(stateID) - for offset := 0; offset < diameter; offset++ { - dist := nfa.multistateDistance(ms, uint32(offset)) - distances = append(distances, dist.distance()) - } - } - - return &ParametricDFA{ - diameter: uint32(msDiameter), - transitions: transitions, - maxDistance: maxDistance, - transitionStride: uint32(numChi), - distance: distances, - }, nil -} - -type hash struct { - index map[[16]byte]int - items []MultiState -} - -func newHash() *hash { - return &hash{ - index: make(map[[16]byte]int, 100), - items: make([]MultiState, 0, 100), - } -} - -func (h *hash) getOrAllocate(m MultiState) int { - size := len(h.items) - var exists bool - var pos int - md5 := getHash(&m) - if pos, exists = h.index[md5]; !exists { - h.index[md5] = size - pos = size - h.items = append(h.items, m) - } - return pos -} - -func (h *hash) getFromID(id int) *MultiState { - return &h.items[id] -} - -func getHash(ms *MultiState) [16]byte { - msBytes := []byte{} - for _, state := range ms.states { - jsonBytes, _ := json.Marshal(&state) - msBytes = append(msBytes, jsonBytes...) - } - return md5.Sum(msBytes) -} diff --git a/vendor/github.com/couchbase/vellum/regexp/compile.go b/vendor/github.com/couchbase/vellum/regexp/compile.go index 55280164c7..92284d0a87 100644 --- a/vendor/github.com/couchbase/vellum/regexp/compile.go +++ b/vendor/github.com/couchbase/vellum/regexp/compile.go @@ -75,15 +75,23 @@ func (c *compiler) c(ast *syntax.Regexp) (err error) { Rune0: [2]rune{r, r}, } next.Rune = next.Rune0[0:2] - return c.c(&next) - } - c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc( - r, r, c.sequences, c.rangeStack, c.startBytes, c.endBytes) - if err != nil { - return err - } - for _, seq := range c.sequences { - c.compileUtf8Ranges(seq) + // try to find more folded runes + for r1 := unicode.SimpleFold(r); r1 != r; r1 = unicode.SimpleFold(r1) { + next.Rune = append(next.Rune, r1, r1) + } + err = c.c(&next) + if err != nil { + return err + } + } else { + c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc( + r, r, c.sequences, c.rangeStack, c.startBytes, c.endBytes) + if err != nil { + return err + } + for _, seq := range c.sequences { + c.compileUtf8Ranges(seq) + } } } case syntax.OpAnyChar: -- cgit v1.2.3