You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

dfa.go 6.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. // Copyright (c) 2018 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package levenshtein
  15. import (
  16. "fmt"
  17. "math"
  18. )
  19. const SinkState = uint32(0)
// DFA is a byte-driven automaton: a per-state transition table together
// with the Distance recorded for each state.
type DFA struct {
	// transitions[s][b] is the state reached from state s after reading byte b.
	transitions [][256]uint32
	// distances[s] is the Distance reported for state s.
	distances []Distance
	// initState is the state from which evaluation starts.
	initState int
	// ed is the value passed to Utf8DFABuilder.build.
	// NOTE(review): presumably the maximum edit distance — confirm with callers.
	ed uint8
}
// initialState returns the id of the state from which evaluation starts.
func (d *DFA) initialState() int {
	return d.initState
}
// distance returns the Levenshtein distance associated with the given
// state. It panics if stateId is outside the range of known states.
func (d *DFA) distance(stateId int) Distance {
	return d.distances[stateId]
}
// numStates returns the number of states in the DFA, i.e. the number of
// rows in its transition table.
func (d *DFA) numStates() int {
	return len(d.transitions)
}
// transition returns the destination state reached from fromState after
// consuming byte b. It panics if fromState is not a valid state id.
func (d *DFA) transition(fromState int, b uint8) int {
	return int(d.transitions[fromState][b])
}
  43. func (d *DFA) eval(bytes []uint8) Distance {
  44. state := d.initialState()
  45. for _, b := range bytes {
  46. state = d.transition(state, b)
  47. }
  48. return d.distance(state)
  49. }
  50. func (d *DFA) Start() int {
  51. return int(d.initialState())
  52. }
  53. func (d *DFA) IsMatch(state int) bool {
  54. if _, ok := d.distance(state).(Exact); ok {
  55. return true
  56. }
  57. return false
  58. }
  59. func (d *DFA) CanMatch(state int) bool {
  60. return state > 0 && state < d.numStates()
  61. }
  62. func (d *DFA) Accept(state int, b byte) int {
  63. return int(d.transition(state, b))
  64. }
// WillAlwaysMatch reports whether the specified state will always end in a
// matching state. This implementation does not attempt to detect such
// states and returns false for every input.
func (d *DFA) WillAlwaysMatch(state int) bool {
	return false
}
  70. func fill(dest []uint32, val uint32) {
  71. for i := range dest {
  72. dest[i] = val
  73. }
  74. }
  75. func fillTransitions(dest *[256]uint32, val uint32) {
  76. for i := range dest {
  77. dest[i] = val
  78. }
  79. }
// Utf8DFAStateBuilder adds transitions to a single state of a
// Utf8DFABuilder. Instances are returned by Utf8DFABuilder.addState.
type Utf8DFAStateBuilder struct {
	// dfaBuilder is the builder whose transition table is modified.
	dfaBuilder *Utf8DFABuilder
	// stateID is the builder-allocated id of the state being configured.
	stateID uint32
	// defaultSuccessor[k] is the state that reaches the default successor
	// after consuming k further bytes, whatever those bytes are.
	defaultSuccessor []uint32
}
  85. func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8,
  86. toStateID uint32) {
  87. sb.dfaBuilder.transitions[fromStateID][b] = toStateID
  88. }
  89. func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) {
  90. fromStateID := sb.stateID
  91. chars := []byte(string(in))
  92. lastByte := chars[len(chars)-1]
  93. for i, ch := range chars[:len(chars)-1] {
  94. remNumBytes := len(chars) - i - 1
  95. defaultSuccessor := sb.defaultSuccessor[remNumBytes]
  96. intermediateStateID := sb.dfaBuilder.transitions[fromStateID][ch]
  97. if intermediateStateID == defaultSuccessor {
  98. intermediateStateID = sb.dfaBuilder.allocate()
  99. fillTransitions(&sb.dfaBuilder.transitions[intermediateStateID],
  100. sb.defaultSuccessor[remNumBytes-1])
  101. }
  102. sb.addTransitionID(fromStateID, ch, intermediateStateID)
  103. fromStateID = intermediateStateID
  104. }
  105. toStateIDDecoded := sb.dfaBuilder.getOrAllocate(original(toStateID))
  106. sb.addTransitionID(fromStateID, lastByte, toStateIDDecoded)
  107. }
  108. type Utf8StateId uint32
  109. func original(stateId uint32) Utf8StateId {
  110. return predecessor(stateId, 0)
  111. }
  112. func predecessor(stateId uint32, numSteps uint8) Utf8StateId {
  113. return Utf8StateId(stateId*4 + uint32(numSteps))
  114. }
// Utf8DFABuilder makes it possible to define a DFA that takes unicode
// characters, and to build from it a `DFA` that operates on UTF-8 encoded
// bytes.
type Utf8DFABuilder struct {
	// index maps a Utf8StateId to the builder-allocated state id.
	// math.MaxUint32 marks an id that has no state allocated yet.
	index []uint32
	// distances[s] is the Distance of allocated state s.
	distances []Distance
	// transitions[s][b] is the successor of allocated state s on byte b.
	transitions [][256]uint32
	// initialState is the allocated id chosen via setInitialState.
	initialState uint32
	// numStates is the number of states allocated so far.
	numStates uint32
	// maxNumStates is the upper bound addState enforces on caller state ids.
	maxNumStates uint32
}
  126. func withMaxStates(maxStates uint32) *Utf8DFABuilder {
  127. rv := &Utf8DFABuilder{
  128. index: make([]uint32, maxStates*2+100),
  129. distances: make([]Distance, 0, maxStates),
  130. transitions: make([][256]uint32, 0, maxStates),
  131. maxNumStates: maxStates,
  132. }
  133. for i := range rv.index {
  134. rv.index[i] = math.MaxUint32
  135. }
  136. return rv
  137. }
  138. func (dfab *Utf8DFABuilder) allocate() uint32 {
  139. newState := dfab.numStates
  140. dfab.numStates++
  141. dfab.distances = append(dfab.distances, Atleast{d: 255})
  142. dfab.transitions = append(dfab.transitions, [256]uint32{})
  143. return newState
  144. }
  145. func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 {
  146. if int(state) >= cap(dfab.index) {
  147. cloneIndex := make([]uint32, int(state)*2)
  148. copy(cloneIndex, dfab.index)
  149. dfab.index = cloneIndex
  150. }
  151. if dfab.index[state] != math.MaxUint32 {
  152. return dfab.index[state]
  153. }
  154. nstate := dfab.allocate()
  155. dfab.index[state] = nstate
  156. return nstate
  157. }
  158. func (dfab *Utf8DFABuilder) setInitialState(iState uint32) {
  159. decodedID := dfab.getOrAllocate(original(iState))
  160. dfab.initialState = decodedID
  161. }
  162. func (dfab *Utf8DFABuilder) build(ed uint8) *DFA {
  163. return &DFA{
  164. transitions: dfab.transitions,
  165. distances: dfab.distances,
  166. initState: int(dfab.initialState),
  167. ed: ed,
  168. }
  169. }
  170. func (dfab *Utf8DFABuilder) addState(state, default_suc_orig uint32,
  171. distance Distance) (*Utf8DFAStateBuilder, error) {
  172. if state > dfab.maxNumStates {
  173. return nil, fmt.Errorf("State id is larger than maxNumStates")
  174. }
  175. stateID := dfab.getOrAllocate(original(state))
  176. dfab.distances[stateID] = distance
  177. defaultSuccID := dfab.getOrAllocate(original(default_suc_orig))
  178. // creates a chain of states of predecessors of `default_suc_orig`.
  179. // Accepting k-bytes (whatever the bytes are) from `predecessor_states[k-1]`
  180. // leads to the `default_suc_orig` state.
  181. predecessorStates := []uint32{defaultSuccID,
  182. defaultSuccID,
  183. defaultSuccID,
  184. defaultSuccID}
  185. for numBytes := uint8(1); numBytes < 4; numBytes++ {
  186. predecessorState := predecessor(default_suc_orig, numBytes)
  187. predecessorStateID := dfab.getOrAllocate(predecessorState)
  188. predecessorStates[numBytes] = predecessorStateID
  189. succ := predecessorStates[numBytes-1]
  190. fillTransitions(&dfab.transitions[predecessorStateID], succ)
  191. }
  192. // 1-byte encoded chars.
  193. fill(dfab.transitions[stateID][0:192], predecessorStates[0])
  194. // 2-bytes encoded chars.
  195. fill(dfab.transitions[stateID][192:224], predecessorStates[1])
  196. // 3-bytes encoded chars.
  197. fill(dfab.transitions[stateID][224:240], predecessorStates[2])
  198. // 4-bytes encoded chars.
  199. fill(dfab.transitions[stateID][240:256], predecessorStates[3])
  200. return &Utf8DFAStateBuilder{
  201. dfaBuilder: dfab,
  202. stateID: stateID,
  203. defaultSuccessor: predecessorStates}, nil
  204. }