diff options
Diffstat (limited to 'vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go')
-rw-r--r-- | vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go | 125 |
1 files changed, 125 insertions, 0 deletions
diff --git a/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go b/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go new file mode 100644 index 0000000000..4bf64fef2e --- /dev/null +++ b/vendor/github.com/couchbase/vellum/levenshtein2/alphabet.go @@ -0,0 +1,125 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package levenshtein2 + +import ( + "fmt" + "sort" + "unicode/utf8" +) + +type FullCharacteristicVector []uint32 + +func (fcv FullCharacteristicVector) shiftAndMask(offset, mask uint32) uint32 { + bucketID := offset / 32 + align := offset - bucketID*32 + if align == 0 { + return fcv[bucketID] & mask + } + left := fcv[bucketID] >> align + right := fcv[bucketID+1] << (32 - align) + return (left | right) & mask +} + +type tuple struct { + char rune + fcv FullCharacteristicVector +} + +type sortRunes []rune + +func (s sortRunes) Less(i, j int) bool { + return s[i] < s[j] +} + +func (s sortRunes) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +func (s sortRunes) Len() int { + return len(s) +} + +func sortRune(r []rune) []rune { + sort.Sort(sortRunes(r)) + return r +} + +type Alphabet struct { + charset []tuple + index uint32 +} + +func (a *Alphabet) resetNext() { + a.index = 0 +} + +func (a *Alphabet) next() (rune, FullCharacteristicVector, error) { + if int(a.index) >= len(a.charset) { + return 0, nil, fmt.Errorf("eof") + } + + rv := a.charset[a.index] + a.index++ + return rv.char, rv.fcv, nil +} + +func dedupe(in string) string { + lookUp := make(map[rune]struct{}, len(in)) + var rv string + for len(in) > 0 { + r, size := utf8.DecodeRuneInString(in) + in = in[size:] + if _, ok := lookUp[r]; !ok { + rv += string(r) + lookUp[r] = struct{}{} + } + } + return rv +} + +func queryChars(qChars string) Alphabet { + chars := dedupe(qChars) + inChars := sortRune([]rune(chars)) + charsets := make([]tuple, 0, len(inChars)) + + for _, c := range inChars { + tempChars := qChars + var bits []uint32 + for len(tempChars) > 0 { + var chunk string + if len(tempChars) > 32 { + chunk = tempChars[0:32] + tempChars = tempChars[32:] + } else { + chunk = tempChars + tempChars = tempChars[:0] + } + + chunkBits := uint32(0) + bit := uint32(1) + for _, chr := range chunk { + if chr == c { + chunkBits |= bit + } + bit <<= 1 + } + bits = append(bits, chunkBits) + } + bits = append(bits, 0) + charsets = append(charsets, tuple{char: c, fcv: FullCharacteristicVector(bits)}) + } + return Alphabet{charset: charsets} +} |