You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

parametric_dfa.go 8.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. // Copyright (c) 2018 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package levenshtein
  15. import (
  16. "crypto/md5"
  17. "encoding/json"
  18. "fmt"
  19. "math"
  20. )
  // ParametricState identifies one state of the parametric automaton as a
  // (shape, offset) pair. A shapeID of 0 is the value isDeadEnd tests for.
  type ParametricState struct {
  	shapeID, offset uint32
  }
  25. func newParametricState() ParametricState {
  26. return ParametricState{}
  27. }
  28. func (ps *ParametricState) isDeadEnd() bool {
  29. return ps.shapeID == 0
  30. }
  // Transition describes one edge of the parametric automaton: the shape it
  // leads to and how far the offset advances when the edge is taken.
  type Transition struct {
  	destShapeID, deltaOffset uint32
  }
  35. func (t *Transition) apply(state ParametricState) ParametricState {
  36. ps := ParametricState{
  37. shapeID: t.destShapeID}
  38. // don't need any offset if we are in the dead state,
  39. // this ensures we have only one dead state.
  40. if t.destShapeID != 0 {
  41. ps.offset = state.offset + t.deltaOffset
  42. }
  43. return ps
  44. }
  45. type ParametricStateIndex struct {
  46. stateIndex []uint32
  47. stateQueue []ParametricState
  48. numOffsets uint32
  49. }
  50. func newParametricStateIndex(queryLen,
  51. numParamState uint32) ParametricStateIndex {
  52. numOffsets := queryLen + 1
  53. if numParamState == 0 {
  54. numParamState = numOffsets
  55. }
  56. maxNumStates := numParamState * numOffsets
  57. psi := ParametricStateIndex{
  58. stateIndex: make([]uint32, maxNumStates),
  59. stateQueue: make([]ParametricState, 0, 150),
  60. numOffsets: numOffsets,
  61. }
  62. for i := uint32(0); i < maxNumStates; i++ {
  63. psi.stateIndex[i] = math.MaxUint32
  64. }
  65. return psi
  66. }
  67. func (psi *ParametricStateIndex) numStates() int {
  68. return len(psi.stateQueue)
  69. }
  70. func (psi *ParametricStateIndex) maxNumStates() int {
  71. return len(psi.stateIndex)
  72. }
  73. func (psi *ParametricStateIndex) get(stateID uint32) ParametricState {
  74. return psi.stateQueue[stateID]
  75. }
  76. func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 {
  77. bucket := ps.shapeID*psi.numOffsets + ps.offset
  78. if bucket < uint32(len(psi.stateIndex)) &&
  79. psi.stateIndex[bucket] != math.MaxUint32 {
  80. return psi.stateIndex[bucket]
  81. }
  82. nState := uint32(len(psi.stateQueue))
  83. psi.stateQueue = append(psi.stateQueue, ps)
  84. psi.stateIndex[bucket] = nState
  85. return nState
  86. }
  87. type ParametricDFA struct {
  88. distance []uint8
  89. transitions []Transition
  90. maxDistance uint8
  91. transitionStride uint32
  92. diameter uint32
  93. }
  94. func (pdfa *ParametricDFA) initialState() ParametricState {
  95. return ParametricState{shapeID: 1}
  96. }
  97. // Returns true iff whatever characters come afterward,
  98. // we will never reach a shorter distance
  99. func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool {
  100. if state.isDeadEnd() {
  101. return true
  102. }
  103. remOffset := queryLen - state.offset
  104. if remOffset < pdfa.diameter {
  105. stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:]
  106. prefixDistance := stateDistances[remOffset]
  107. if prefixDistance > pdfa.maxDistance {
  108. return false
  109. }
  110. for _, d := range stateDistances {
  111. if d < prefixDistance {
  112. return false
  113. }
  114. }
  115. return true
  116. }
  117. return false
  118. }
  119. func (pdfa *ParametricDFA) numStates() int {
  120. return len(pdfa.transitions) / int(pdfa.transitionStride)
  121. }
  // min returns the smaller of x and y.
  func min(x, y uint32) uint32 {
  	if y < x {
  		return y
  	}
  	return x
  }
  128. func (pdfa *ParametricDFA) transition(state ParametricState,
  129. chi uint32) Transition {
  130. return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi]
  131. }
  132. func (pdfa *ParametricDFA) getDistance(state ParametricState,
  133. qLen uint32) Distance {
  134. remainingOffset := qLen - state.offset
  135. if state.isDeadEnd() || remainingOffset >= pdfa.diameter {
  136. return Atleast{d: pdfa.maxDistance + 1}
  137. }
  138. dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)]
  139. if dist > pdfa.maxDistance {
  140. return Atleast{d: dist}
  141. }
  142. return Exact{d: dist}
  143. }
  144. func (pdfa *ParametricDFA) computeDistance(left, right string) Distance {
  145. state := pdfa.initialState()
  146. leftChars := []rune(left)
  147. for _, chr := range []rune(right) {
  148. start := state.offset
  149. stop := min(start+pdfa.diameter, uint32(len(leftChars)))
  150. chi := characteristicVector(leftChars[start:stop], chr)
  151. transition := pdfa.transition(state, uint32(chi))
  152. state = transition.apply(state)
  153. if state.isDeadEnd() {
  154. return Atleast{d: pdfa.maxDistance + 1}
  155. }
  156. }
  157. return pdfa.getDistance(state, uint32(len(left)))
  158. }
  159. func (pdfa *ParametricDFA) buildDfa(query string, distance uint8,
  160. prefix bool) (*DFA, error) {
  161. qLen := uint32(len([]rune(query)))
  162. alphabet := queryChars(query)
  163. psi := newParametricStateIndex(qLen, uint32(pdfa.numStates()))
  164. maxNumStates := psi.maxNumStates()
  165. deadEndStateID := psi.getOrAllocate(newParametricState())
  166. if deadEndStateID != 0 {
  167. return nil, fmt.Errorf("Invalid dead end state")
  168. }
  169. initialStateID := psi.getOrAllocate(pdfa.initialState())
  170. dfaBuilder := withMaxStates(uint32(maxNumStates))
  171. mask := uint32((1 << pdfa.diameter) - 1)
  172. var stateID int
  173. for stateID = 0; stateID < StateLimit; stateID++ {
  174. if stateID == psi.numStates() {
  175. break
  176. }
  177. state := psi.get(uint32(stateID))
  178. if prefix && pdfa.isPrefixSink(state, qLen) {
  179. distance := pdfa.getDistance(state, qLen)
  180. dfaBuilder.addState(uint32(stateID), uint32(stateID), distance)
  181. } else {
  182. transition := pdfa.transition(state, 0)
  183. defSuccessor := transition.apply(state)
  184. defSuccessorID := psi.getOrAllocate(defSuccessor)
  185. distance := pdfa.getDistance(state, qLen)
  186. stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance)
  187. if err != nil {
  188. return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err)
  189. }
  190. alphabet.resetNext()
  191. chr, cv, err := alphabet.next()
  192. for err == nil {
  193. chi := cv.shiftAndMask(state.offset, mask)
  194. transition := pdfa.transition(state, chi)
  195. destState := transition.apply(state)
  196. destStateID := psi.getOrAllocate(destState)
  197. stateBuilder.addTransition(chr, destStateID)
  198. chr, cv, err = alphabet.next()
  199. }
  200. }
  201. }
  202. if stateID == StateLimit {
  203. return nil, ErrTooManyStates
  204. }
  205. dfaBuilder.setInitialState(initialStateID)
  206. return dfaBuilder.build(distance), nil
  207. }
  208. func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) {
  209. lookUp := newHash()
  210. lookUp.getOrAllocate(*newMultiState())
  211. initialState := nfa.initialStates()
  212. lookUp.getOrAllocate(*initialState)
  213. maxDistance := nfa.maxDistance()
  214. msDiameter := nfa.msDiameter()
  215. numChi := 1 << msDiameter
  216. chiValues := make([]uint64, numChi)
  217. for i := 0; i < numChi; i++ {
  218. chiValues[i] = uint64(i)
  219. }
  220. transitions := make([]Transition, 0, numChi*int(msDiameter))
  221. var stateID int
  222. for stateID = 0; stateID < StateLimit; stateID++ {
  223. if stateID == len(lookUp.items) {
  224. break
  225. }
  226. for _, chi := range chiValues {
  227. destMs := newMultiState()
  228. ms := lookUp.getFromID(stateID)
  229. nfa.transition(ms, destMs, chi)
  230. translation := destMs.normalize()
  231. destID := lookUp.getOrAllocate(*destMs)
  232. transitions = append(transitions, Transition{
  233. destShapeID: uint32(destID),
  234. deltaOffset: translation,
  235. })
  236. }
  237. }
  238. if stateID == StateLimit {
  239. return nil, ErrTooManyStates
  240. }
  241. ns := len(lookUp.items)
  242. diameter := int(msDiameter)
  243. distances := make([]uint8, 0, diameter*ns)
  244. for stateID := 0; stateID < ns; stateID++ {
  245. ms := lookUp.getFromID(stateID)
  246. for offset := 0; offset < diameter; offset++ {
  247. dist := nfa.multistateDistance(ms, uint32(offset))
  248. distances = append(distances, dist.distance())
  249. }
  250. }
  251. return &ParametricDFA{
  252. diameter: uint32(msDiameter),
  253. transitions: transitions,
  254. maxDistance: maxDistance,
  255. transitionStride: uint32(numChi),
  256. distance: distances,
  257. }, nil
  258. }
  259. type hash struct {
  260. index map[[16]byte]int
  261. items []MultiState
  262. }
  263. func newHash() *hash {
  264. return &hash{
  265. index: make(map[[16]byte]int, 100),
  266. items: make([]MultiState, 0, 100),
  267. }
  268. }
  269. func (h *hash) getOrAllocate(m MultiState) int {
  270. size := len(h.items)
  271. var exists bool
  272. var pos int
  273. md5 := getHash(&m)
  274. if pos, exists = h.index[md5]; !exists {
  275. h.index[md5] = size
  276. pos = size
  277. h.items = append(h.items, m)
  278. }
  279. return pos
  280. }
  281. func (h *hash) getFromID(id int) *MultiState {
  282. return &h.items[id]
  283. }
  284. func getHash(ms *MultiState) [16]byte {
  285. msBytes := []byte{}
  286. for _, state := range ms.states {
  287. jsonBytes, _ := json.Marshal(&state)
  288. msBytes = append(msBytes, jsonBytes...)
  289. }
  290. return md5.Sum(msBytes)
  291. }