You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

compile.go 7.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package regexp
  15. import (
  16. "regexp/syntax"
  17. "unicode"
  18. unicode_utf8 "unicode/utf8"
  19. "github.com/blevesearch/vellum/utf8"
  20. )
  21. type compiler struct {
  22. sizeLimit uint
  23. insts prog
  24. instsPool []inst
  25. sequences utf8.Sequences
  26. rangeStack utf8.RangeStack
  27. startBytes []byte
  28. endBytes []byte
  29. }
  30. func newCompiler(sizeLimit uint) *compiler {
  31. return &compiler{
  32. sizeLimit: sizeLimit,
  33. startBytes: make([]byte, unicode_utf8.UTFMax),
  34. endBytes: make([]byte, unicode_utf8.UTFMax),
  35. }
  36. }
  37. func (c *compiler) compile(ast *syntax.Regexp) (prog, error) {
  38. err := c.c(ast)
  39. if err != nil {
  40. return nil, err
  41. }
  42. inst := c.allocInst()
  43. inst.op = OpMatch
  44. c.insts = append(c.insts, inst)
  45. return c.insts, nil
  46. }
  47. func (c *compiler) c(ast *syntax.Regexp) (err error) {
  48. if ast.Flags&syntax.NonGreedy > 1 {
  49. return ErrNoLazy
  50. }
  51. switch ast.Op {
  52. case syntax.OpEndLine, syntax.OpBeginLine,
  53. syntax.OpBeginText, syntax.OpEndText:
  54. return ErrNoEmpty
  55. case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
  56. return ErrNoWordBoundary
  57. case syntax.OpEmptyMatch:
  58. return nil
  59. case syntax.OpLiteral:
  60. for _, r := range ast.Rune {
  61. if ast.Flags&syntax.FoldCase > 0 {
  62. next := syntax.Regexp{
  63. Op: syntax.OpCharClass,
  64. Flags: ast.Flags & syntax.FoldCase,
  65. Rune0: [2]rune{r, r},
  66. }
  67. next.Rune = next.Rune0[0:2]
  68. // try to find more folded runes
  69. for r1 := unicode.SimpleFold(r); r1 != r; r1 = unicode.SimpleFold(r1) {
  70. next.Rune = append(next.Rune, r1, r1)
  71. }
  72. err = c.c(&next)
  73. if err != nil {
  74. return err
  75. }
  76. } else {
  77. c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
  78. r, r, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
  79. if err != nil {
  80. return err
  81. }
  82. for _, seq := range c.sequences {
  83. c.compileUtf8Ranges(seq)
  84. }
  85. }
  86. }
  87. case syntax.OpAnyChar:
  88. next := syntax.Regexp{
  89. Op: syntax.OpCharClass,
  90. Flags: ast.Flags & syntax.FoldCase,
  91. Rune0: [2]rune{0, unicode.MaxRune},
  92. }
  93. next.Rune = next.Rune0[:2]
  94. return c.c(&next)
  95. case syntax.OpAnyCharNotNL:
  96. next := syntax.Regexp{
  97. Op: syntax.OpCharClass,
  98. Flags: ast.Flags & syntax.FoldCase,
  99. Rune: []rune{0, 0x09, 0x0B, unicode.MaxRune},
  100. }
  101. return c.c(&next)
  102. case syntax.OpCharClass:
  103. return c.compileClass(ast)
  104. case syntax.OpCapture:
  105. return c.c(ast.Sub[0])
  106. case syntax.OpConcat:
  107. for _, sub := range ast.Sub {
  108. err := c.c(sub)
  109. if err != nil {
  110. return err
  111. }
  112. }
  113. return nil
  114. case syntax.OpAlternate:
  115. if len(ast.Sub) == 0 {
  116. return nil
  117. }
  118. jmpsToEnd := make([]uint, 0, len(ast.Sub)-1)
  119. // does not handle last entry
  120. for i := 0; i < len(ast.Sub)-1; i++ {
  121. sub := ast.Sub[i]
  122. split := c.emptySplit()
  123. j1 := c.top()
  124. err := c.c(sub)
  125. if err != nil {
  126. return err
  127. }
  128. jmpsToEnd = append(jmpsToEnd, c.emptyJump())
  129. j2 := c.top()
  130. c.setSplit(split, j1, j2)
  131. }
  132. // handle last entry
  133. err := c.c(ast.Sub[len(ast.Sub)-1])
  134. if err != nil {
  135. return err
  136. }
  137. end := uint(len(c.insts))
  138. for _, jmpToEnd := range jmpsToEnd {
  139. c.setJump(jmpToEnd, end)
  140. }
  141. case syntax.OpQuest:
  142. split := c.emptySplit()
  143. j1 := c.top()
  144. err := c.c(ast.Sub[0])
  145. if err != nil {
  146. return err
  147. }
  148. j2 := c.top()
  149. c.setSplit(split, j1, j2)
  150. case syntax.OpStar:
  151. j1 := c.top()
  152. split := c.emptySplit()
  153. j2 := c.top()
  154. err := c.c(ast.Sub[0])
  155. if err != nil {
  156. return err
  157. }
  158. jmp := c.emptyJump()
  159. j3 := uint(len(c.insts))
  160. c.setJump(jmp, j1)
  161. c.setSplit(split, j2, j3)
  162. case syntax.OpPlus:
  163. j1 := c.top()
  164. err := c.c(ast.Sub[0])
  165. if err != nil {
  166. return err
  167. }
  168. split := c.emptySplit()
  169. j2 := c.top()
  170. c.setSplit(split, j1, j2)
  171. case syntax.OpRepeat:
  172. if ast.Max == -1 {
  173. for i := 0; i < ast.Min; i++ {
  174. err := c.c(ast.Sub[0])
  175. if err != nil {
  176. return err
  177. }
  178. }
  179. next := syntax.Regexp{
  180. Op: syntax.OpStar,
  181. Flags: ast.Flags,
  182. Sub: ast.Sub,
  183. Sub0: ast.Sub0,
  184. Rune: ast.Rune,
  185. Rune0: ast.Rune0,
  186. }
  187. return c.c(&next)
  188. }
  189. for i := 0; i < ast.Min; i++ {
  190. err := c.c(ast.Sub[0])
  191. if err != nil {
  192. return err
  193. }
  194. }
  195. splits := make([]uint, 0, ast.Max-ast.Min)
  196. starts := make([]uint, 0, ast.Max-ast.Min)
  197. for i := ast.Min; i < ast.Max; i++ {
  198. splits = append(splits, c.emptySplit())
  199. starts = append(starts, uint(len(c.insts)))
  200. err := c.c(ast.Sub[0])
  201. if err != nil {
  202. return err
  203. }
  204. }
  205. end := uint(len(c.insts))
  206. for i := 0; i < len(splits); i++ {
  207. c.setSplit(splits[i], starts[i], end)
  208. }
  209. }
  210. return c.checkSize()
  211. }
  212. func (c *compiler) checkSize() error {
  213. if uint(len(c.insts)*instSize) > c.sizeLimit {
  214. return ErrCompiledTooBig
  215. }
  216. return nil
  217. }
  218. func (c *compiler) compileClass(ast *syntax.Regexp) error {
  219. if len(ast.Rune) == 0 {
  220. return nil
  221. }
  222. jmps := make([]uint, 0, len(ast.Rune)-2)
  223. // does not do last pair
  224. for i := 0; i < len(ast.Rune)-2; i += 2 {
  225. rstart := ast.Rune[i]
  226. rend := ast.Rune[i+1]
  227. split := c.emptySplit()
  228. j1 := c.top()
  229. err := c.compileClassRange(rstart, rend)
  230. if err != nil {
  231. return err
  232. }
  233. jmps = append(jmps, c.emptyJump())
  234. j2 := c.top()
  235. c.setSplit(split, j1, j2)
  236. }
  237. // handle last pair
  238. rstart := ast.Rune[len(ast.Rune)-2]
  239. rend := ast.Rune[len(ast.Rune)-1]
  240. err := c.compileClassRange(rstart, rend)
  241. if err != nil {
  242. return err
  243. }
  244. end := c.top()
  245. for _, jmp := range jmps {
  246. c.setJump(jmp, end)
  247. }
  248. return nil
  249. }
  250. func (c *compiler) compileClassRange(startR, endR rune) (err error) {
  251. c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
  252. startR, endR, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
  253. if err != nil {
  254. return err
  255. }
  256. jmps := make([]uint, 0, len(c.sequences)-1)
  257. // does not do last entry
  258. for i := 0; i < len(c.sequences)-1; i++ {
  259. seq := c.sequences[i]
  260. split := c.emptySplit()
  261. j1 := c.top()
  262. c.compileUtf8Ranges(seq)
  263. jmps = append(jmps, c.emptyJump())
  264. j2 := c.top()
  265. c.setSplit(split, j1, j2)
  266. }
  267. // handle last entry
  268. c.compileUtf8Ranges(c.sequences[len(c.sequences)-1])
  269. end := c.top()
  270. for _, jmp := range jmps {
  271. c.setJump(jmp, end)
  272. }
  273. return nil
  274. }
  275. func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) {
  276. for _, r := range seq {
  277. inst := c.allocInst()
  278. inst.op = OpRange
  279. inst.rangeStart = r.Start
  280. inst.rangeEnd = r.End
  281. c.insts = append(c.insts, inst)
  282. }
  283. }
  284. func (c *compiler) emptySplit() uint {
  285. inst := c.allocInst()
  286. inst.op = OpSplit
  287. c.insts = append(c.insts, inst)
  288. return c.top() - 1
  289. }
  290. func (c *compiler) emptyJump() uint {
  291. inst := c.allocInst()
  292. inst.op = OpJmp
  293. c.insts = append(c.insts, inst)
  294. return c.top() - 1
  295. }
  296. func (c *compiler) setSplit(i, pc1, pc2 uint) {
  297. split := c.insts[i]
  298. split.splitA = pc1
  299. split.splitB = pc2
  300. }
  301. func (c *compiler) setJump(i, pc uint) {
  302. jmp := c.insts[i]
  303. jmp.to = pc
  304. }
  305. func (c *compiler) top() uint {
  306. return uint(len(c.insts))
  307. }
  308. func (c *compiler) allocInst() *inst {
  309. if len(c.instsPool) <= 0 {
  310. c.instsPool = make([]inst, 16)
  311. }
  312. inst := &c.instsPool[0]
  313. c.instsPool = c.instsPool[1:]
  314. return inst
  315. }