You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

dfa.go 4.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package regexp
  15. import (
  16. "encoding/binary"
  17. "fmt"
  18. )
// StateLimit is the maximum number of states allowed in a DFA built by this
// package.
const StateLimit = 10000

// ErrTooManyStates is returned if you attempt to build a DFA which requires
// more than StateLimit states.
var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states",
	StateLimit)
// dfaBuilder incrementally constructs a dfa from a compiled program.
type dfaBuilder struct {
	// dfa is the automaton under construction.
	dfa *dfa
	// cache maps the byte key of an instruction set (see instsKey) to the
	// index of the state that was already created for that set.
	cache map[string]int
	// keyBuf is a scratch buffer reused for computing cache keys.
	keyBuf []byte
}
  30. func newDfaBuilder(insts prog) *dfaBuilder {
  31. d := &dfaBuilder{
  32. dfa: &dfa{
  33. insts: insts,
  34. states: make([]state, 0, 16),
  35. },
  36. cache: make(map[string]int, 1024),
  37. }
  38. // add 0 state that is invalid
  39. d.dfa.states = append(d.dfa.states, state{
  40. next: make([]int, 256),
  41. match: false,
  42. })
  43. return d
  44. }
  45. func (d *dfaBuilder) build() (*dfa, error) {
  46. cur := newSparseSet(uint(len(d.dfa.insts)))
  47. next := newSparseSet(uint(len(d.dfa.insts)))
  48. d.dfa.add(cur, 0)
  49. ns, instsReuse := d.cachedState(cur, nil)
  50. states := intStack{ns}
  51. seen := make(map[int]struct{})
  52. var s int
  53. states, s = states.Pop()
  54. for s != 0 {
  55. for b := 0; b < 256; b++ {
  56. var ns int
  57. ns, instsReuse = d.runState(cur, next, s, byte(b), instsReuse)
  58. if ns != 0 {
  59. if _, ok := seen[ns]; !ok {
  60. seen[ns] = struct{}{}
  61. states = states.Push(ns)
  62. }
  63. }
  64. if len(d.dfa.states) > StateLimit {
  65. return nil, ErrTooManyStates
  66. }
  67. }
  68. states, s = states.Pop()
  69. }
  70. return d.dfa, nil
  71. }
  72. func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte, instsReuse []uint) (
  73. int, []uint) {
  74. cur.Clear()
  75. for _, ip := range d.dfa.states[state].insts {
  76. cur.Add(ip)
  77. }
  78. d.dfa.run(cur, next, b)
  79. var nextState int
  80. nextState, instsReuse = d.cachedState(next, instsReuse)
  81. d.dfa.states[state].next[b] = nextState
  82. return nextState, instsReuse
  83. }
  84. func instsKey(insts []uint, buf []byte) []byte {
  85. if cap(buf) < 8*len(insts) {
  86. buf = make([]byte, 8*len(insts))
  87. } else {
  88. buf = buf[0 : 8*len(insts)]
  89. }
  90. for i, inst := range insts {
  91. binary.LittleEndian.PutUint64(buf[i*8:], uint64(inst))
  92. }
  93. return buf
  94. }
  95. func (d *dfaBuilder) cachedState(set *sparseSet,
  96. instsReuse []uint) (int, []uint) {
  97. insts := instsReuse[:0]
  98. if cap(insts) == 0 {
  99. insts = make([]uint, 0, set.Len())
  100. }
  101. var isMatch bool
  102. for i := uint(0); i < uint(set.Len()); i++ {
  103. ip := set.Get(i)
  104. switch d.dfa.insts[ip].op {
  105. case OpRange:
  106. insts = append(insts, ip)
  107. case OpMatch:
  108. isMatch = true
  109. insts = append(insts, ip)
  110. }
  111. }
  112. if len(insts) == 0 {
  113. return 0, insts
  114. }
  115. d.keyBuf = instsKey(insts, d.keyBuf)
  116. v, ok := d.cache[string(d.keyBuf)]
  117. if ok {
  118. return v, insts
  119. }
  120. d.dfa.states = append(d.dfa.states, state{
  121. insts: insts,
  122. next: make([]int, 256),
  123. match: isMatch,
  124. })
  125. newV := len(d.dfa.states) - 1
  126. d.cache[string(d.keyBuf)] = newV
  127. return newV, nil
  128. }
// dfa is a deterministic finite automaton derived from a compiled program.
type dfa struct {
	// insts is the program the automaton is built from.
	insts prog
	// states holds the automaton's states. A state's next entries are
	// indexes into this slice; when built via newDfaBuilder, index 0 is the
	// invalid state.
	states []state
}
  133. func (d *dfa) add(set *sparseSet, ip uint) {
  134. if set.Contains(ip) {
  135. return
  136. }
  137. set.Add(ip)
  138. switch d.insts[ip].op {
  139. case OpJmp:
  140. d.add(set, d.insts[ip].to)
  141. case OpSplit:
  142. d.add(set, d.insts[ip].splitA)
  143. d.add(set, d.insts[ip].splitB)
  144. }
  145. }
  146. func (d *dfa) run(from, to *sparseSet, b byte) bool {
  147. to.Clear()
  148. var isMatch bool
  149. for i := uint(0); i < uint(from.Len()); i++ {
  150. ip := from.Get(i)
  151. switch d.insts[ip].op {
  152. case OpMatch:
  153. isMatch = true
  154. case OpRange:
  155. if d.insts[ip].rangeStart <= b &&
  156. b <= d.insts[ip].rangeEnd {
  157. d.add(to, ip+1)
  158. }
  159. }
  160. }
  161. return isMatch
  162. }
// state is a single state of a dfa.
type state struct {
	// insts lists the program instructions that make up this state; the
	// builder stores only OpRange and OpMatch instructions here.
	insts []uint
	// next is indexed by input byte and holds the index of the state to
	// move to; the builder allocates it with 256 entries.
	next []int
	// match is true when the state's instruction set includes an OpMatch.
	match bool
}
  168. type intStack []int
  169. func (s intStack) Push(v int) intStack {
  170. return append(s, v)
  171. }
  172. func (s intStack) Pop() (intStack, int) {
  173. l := len(s)
  174. if l < 1 {
  175. return s, 0
  176. }
  177. return s[:l-1], s[l-1]
  178. }