123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250 |
- // Copyright (c) 2018 Couchbase, Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
-
- package levenshtein
-
- import (
- "fmt"
- "math"
- )
-
// SinkState is the state ID 0. CanMatch reports false for state 0, i.e. it is
// treated as a state from which no match can be reached.
const SinkState uint32 = 0
-
- type DFA struct {
- transitions [][256]uint32
- distances []Distance
- initState int
- ed uint8
- }
-
- /// Returns the initial state
- func (d *DFA) initialState() int {
- return d.initState
- }
-
- /// Returns the Levenshtein distance associated to the
- /// current state.
- func (d *DFA) distance(stateId int) Distance {
- return d.distances[stateId]
- }
-
- /// Returns the number of states in the `DFA`.
- func (d *DFA) numStates() int {
- return len(d.transitions)
- }
-
- /// Returns the destination state reached after consuming a given byte.
- func (d *DFA) transition(fromState int, b uint8) int {
- return int(d.transitions[fromState][b])
- }
-
- func (d *DFA) eval(bytes []uint8) Distance {
- state := d.initialState()
-
- for _, b := range bytes {
- state = d.transition(state, b)
- }
-
- return d.distance(state)
- }
-
- func (d *DFA) Start() int {
- return int(d.initialState())
- }
-
- func (d *DFA) IsMatch(state int) bool {
- if _, ok := d.distance(state).(Exact); ok {
- return true
- }
- return false
- }
-
- func (d *DFA) CanMatch(state int) bool {
- return state > 0 && state < d.numStates()
- }
-
- func (d *DFA) Accept(state int, b byte) int {
- return int(d.transition(state, b))
- }
-
- // WillAlwaysMatch returns if the specified state will always end in a
- // matching state.
- func (d *DFA) WillAlwaysMatch(state int) bool {
- return false
- }
-
// fill overwrites every element of dest with val. A nil or empty slice is a
// no-op.
func fill(dest []uint32, val uint32) {
	for idx := 0; idx < len(dest); idx++ {
		dest[idx] = val
	}
}
-
// fillTransitions sets all 256 entries of the transition row dest to val.
func fillTransitions(dest *[256]uint32, val uint32) {
	for idx := 0; idx < len(dest); idx++ {
		dest[idx] = val
	}
}
-
- type Utf8DFAStateBuilder struct {
- dfaBuilder *Utf8DFABuilder
- stateID uint32
- defaultSuccessor []uint32
- }
-
- func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8,
- toStateID uint32) {
- sb.dfaBuilder.transitions[fromStateID][b] = toStateID
- }
-
- func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) {
- fromStateID := sb.stateID
- chars := []byte(string(in))
- lastByte := chars[len(chars)-1]
-
- for i, ch := range chars[:len(chars)-1] {
- remNumBytes := len(chars) - i - 1
- defaultSuccessor := sb.defaultSuccessor[remNumBytes]
- intermediateStateID := sb.dfaBuilder.transitions[fromStateID][ch]
-
- if intermediateStateID == defaultSuccessor {
- intermediateStateID = sb.dfaBuilder.allocate()
- fillTransitions(&sb.dfaBuilder.transitions[intermediateStateID],
- sb.defaultSuccessor[remNumBytes-1])
- }
-
- sb.addTransitionID(fromStateID, ch, intermediateStateID)
- fromStateID = intermediateStateID
- }
-
- toStateIDDecoded := sb.dfaBuilder.getOrAllocate(original(toStateID))
- sb.addTransitionID(fromStateID, lastByte, toStateIDDecoded)
- }
-
// Utf8StateId identifies a state of the UTF-8 level automaton before it has
// been mapped to an allocated DFA state. Values are produced by original and
// predecessor.
type Utf8StateId uint32
-
- func original(stateId uint32) Utf8StateId {
- return predecessor(stateId, 0)
- }
-
- func predecessor(stateId uint32, numSteps uint8) Utf8StateId {
- return Utf8StateId(stateId*4 + uint32(numSteps))
- }
-
- // Utf8DFABuilder makes it possible to define a DFA
- // that takes unicode character, and build a `DFA`
- // that operates on utf-8 encoded
- type Utf8DFABuilder struct {
- index []uint32
- distances []Distance
- transitions [][256]uint32
- initialState uint32
- numStates uint32
- maxNumStates uint32
- }
-
- func withMaxStates(maxStates uint32) *Utf8DFABuilder {
- rv := &Utf8DFABuilder{
- index: make([]uint32, maxStates*2+100),
- distances: make([]Distance, 0, maxStates),
- transitions: make([][256]uint32, 0, maxStates),
- maxNumStates: maxStates,
- }
-
- for i := range rv.index {
- rv.index[i] = math.MaxUint32
- }
-
- return rv
- }
-
- func (dfab *Utf8DFABuilder) allocate() uint32 {
- newState := dfab.numStates
- dfab.numStates++
-
- dfab.distances = append(dfab.distances, Atleast{d: 255})
- dfab.transitions = append(dfab.transitions, [256]uint32{})
-
- return newState
- }
-
- func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 {
- if int(state) >= cap(dfab.index) {
- cloneIndex := make([]uint32, int(state)*2)
- copy(cloneIndex, dfab.index)
- dfab.index = cloneIndex
- }
- if dfab.index[state] != math.MaxUint32 {
- return dfab.index[state]
- }
-
- nstate := dfab.allocate()
- dfab.index[state] = nstate
-
- return nstate
- }
-
- func (dfab *Utf8DFABuilder) setInitialState(iState uint32) {
- decodedID := dfab.getOrAllocate(original(iState))
- dfab.initialState = decodedID
- }
-
- func (dfab *Utf8DFABuilder) build(ed uint8) *DFA {
- return &DFA{
- transitions: dfab.transitions,
- distances: dfab.distances,
- initState: int(dfab.initialState),
- ed: ed,
- }
- }
-
- func (dfab *Utf8DFABuilder) addState(state, default_suc_orig uint32,
- distance Distance) (*Utf8DFAStateBuilder, error) {
- if state > dfab.maxNumStates {
- return nil, fmt.Errorf("State id is larger than maxNumStates")
- }
-
- stateID := dfab.getOrAllocate(original(state))
- dfab.distances[stateID] = distance
-
- defaultSuccID := dfab.getOrAllocate(original(default_suc_orig))
- // creates a chain of states of predecessors of `default_suc_orig`.
- // Accepting k-bytes (whatever the bytes are) from `predecessor_states[k-1]`
- // leads to the `default_suc_orig` state.
- predecessorStates := []uint32{defaultSuccID,
- defaultSuccID,
- defaultSuccID,
- defaultSuccID}
-
- for numBytes := uint8(1); numBytes < 4; numBytes++ {
- predecessorState := predecessor(default_suc_orig, numBytes)
- predecessorStateID := dfab.getOrAllocate(predecessorState)
- predecessorStates[numBytes] = predecessorStateID
- succ := predecessorStates[numBytes-1]
- fillTransitions(&dfab.transitions[predecessorStateID], succ)
- }
-
- // 1-byte encoded chars.
- fill(dfab.transitions[stateID][0:192], predecessorStates[0])
- // 2-bytes encoded chars.
- fill(dfab.transitions[stateID][192:224], predecessorStates[1])
- // 3-bytes encoded chars.
- fill(dfab.transitions[stateID][224:240], predecessorStates[2])
- // 4-bytes encoded chars.
- fill(dfab.transitions[stateID][240:256], predecessorStates[3])
-
- return &Utf8DFAStateBuilder{
- dfaBuilder: dfab,
- stateID: stateID,
- defaultSuccessor: predecessorStates}, nil
- }
|