summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/couchbase/vellum/regexp/compile.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/couchbase/vellum/regexp/compile.go')
-rw-r--r--vendor/github.com/couchbase/vellum/regexp/compile.go316
1 files changed, 316 insertions, 0 deletions
diff --git a/vendor/github.com/couchbase/vellum/regexp/compile.go b/vendor/github.com/couchbase/vellum/regexp/compile.go
new file mode 100644
index 0000000000..6922b749db
--- /dev/null
+++ b/vendor/github.com/couchbase/vellum/regexp/compile.go
@@ -0,0 +1,316 @@
+// Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+import (
+ "regexp/syntax"
+ "unicode"
+
+ "github.com/couchbase/vellum/utf8"
+)
+
+type compiler struct {
+ sizeLimit uint
+ insts prog
+}
+
+func newCompiler(sizeLimit uint) *compiler {
+ return &compiler{
+ sizeLimit: sizeLimit,
+ }
+}
+
+func (c *compiler) compile(ast *syntax.Regexp) (prog, error) {
+ err := c.c(ast)
+ if err != nil {
+ return nil, err
+ }
+ c.insts = append(c.insts, &inst{
+ op: OpMatch,
+ })
+ return c.insts, nil
+}
+
+func (c *compiler) c(ast *syntax.Regexp) error {
+ if ast.Flags&syntax.NonGreedy > 1 {
+ return ErrNoLazy
+ }
+
+ switch ast.Op {
+ case syntax.OpEndLine, syntax.OpBeginLine,
+ syntax.OpBeginText, syntax.OpEndText:
+ return ErrNoEmpty
+ case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
+ return ErrNoWordBoundary
+ case syntax.OpEmptyMatch:
+ return nil
+ case syntax.OpLiteral:
+ for _, r := range ast.Rune {
+ if ast.Flags&syntax.FoldCase > 0 {
+ next := syntax.Regexp{
+ Op: syntax.OpCharClass,
+ Flags: ast.Flags & syntax.FoldCase,
+ Rune0: [2]rune{r, r},
+ }
+ next.Rune = next.Rune0[0:2]
+ return c.c(&next)
+ }
+ seqs, err := utf8.NewSequences(r, r)
+ if err != nil {
+ return err
+ }
+ for _, seq := range seqs {
+ c.compileUtf8Ranges(seq)
+ }
+ }
+ case syntax.OpAnyChar:
+ next := syntax.Regexp{
+ Op: syntax.OpCharClass,
+ Flags: ast.Flags & syntax.FoldCase,
+ Rune0: [2]rune{0, unicode.MaxRune},
+ }
+ next.Rune = next.Rune0[:2]
+ return c.c(&next)
+ case syntax.OpAnyCharNotNL:
+ next := syntax.Regexp{
+ Op: syntax.OpCharClass,
+ Flags: ast.Flags & syntax.FoldCase,
+ Rune: []rune{0, 0x09, 0x0B, unicode.MaxRune},
+ }
+ return c.c(&next)
+ case syntax.OpCharClass:
+ return c.compileClass(ast)
+ case syntax.OpCapture:
+ return c.c(ast.Sub[0])
+ case syntax.OpConcat:
+ for _, sub := range ast.Sub {
+ err := c.c(sub)
+ if err != nil {
+ return err
+ }
+ }
+ return nil
+ case syntax.OpAlternate:
+ if len(ast.Sub) == 0 {
+ return nil
+ }
+ jmpsToEnd := []uint{}
+
+ // does not handle last entry
+ for i := 0; i < len(ast.Sub)-1; i++ {
+ sub := ast.Sub[i]
+ split := c.emptySplit()
+ j1 := c.top()
+ err := c.c(sub)
+ if err != nil {
+ return err
+ }
+ jmpsToEnd = append(jmpsToEnd, c.emptyJump())
+ j2 := c.top()
+ c.setSplit(split, j1, j2)
+ }
+ // handle last entry
+ err := c.c(ast.Sub[len(ast.Sub)-1])
+ if err != nil {
+ return err
+ }
+ end := uint(len(c.insts))
+ for _, jmpToEnd := range jmpsToEnd {
+ c.setJump(jmpToEnd, end)
+ }
+ case syntax.OpQuest:
+ split := c.emptySplit()
+ j1 := c.top()
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ j2 := c.top()
+ c.setSplit(split, j1, j2)
+
+ case syntax.OpStar:
+ j1 := c.top()
+ split := c.emptySplit()
+ j2 := c.top()
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ jmp := c.emptyJump()
+ j3 := uint(len(c.insts))
+
+ c.setJump(jmp, j1)
+ c.setSplit(split, j2, j3)
+
+ case syntax.OpPlus:
+ j1 := c.top()
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ split := c.emptySplit()
+ j2 := c.top()
+ c.setSplit(split, j1, j2)
+
+ case syntax.OpRepeat:
+ if ast.Max == -1 {
+ for i := 0; i < ast.Min; i++ {
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ }
+ next := syntax.Regexp{
+ Op: syntax.OpStar,
+ Flags: ast.Flags,
+ Sub: ast.Sub,
+ Sub0: ast.Sub0,
+ Rune: ast.Rune,
+ Rune0: ast.Rune0,
+ }
+ return c.c(&next)
+ }
+ for i := 0; i < ast.Min; i++ {
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ }
+ var splits, starts []uint
+ for i := ast.Min; i < ast.Max; i++ {
+ splits = append(splits, c.emptySplit())
+ starts = append(starts, uint(len(c.insts)))
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ }
+ end := uint(len(c.insts))
+ for i := 0; i < len(splits); i++ {
+ c.setSplit(splits[i], starts[i], end)
+ }
+
+ }
+
+ return c.checkSize()
+}
+
+func (c *compiler) checkSize() error {
+ if uint(len(c.insts)*instSize) > c.sizeLimit {
+ return ErrCompiledTooBig
+ }
+ return nil
+}
+
+func (c *compiler) compileClass(ast *syntax.Regexp) error {
+ if len(ast.Rune) == 0 {
+ return nil
+ }
+ var jmps []uint
+
+ // does not do last pair
+ for i := 0; i < len(ast.Rune)-2; i += 2 {
+ rstart := ast.Rune[i]
+ rend := ast.Rune[i+1]
+
+ split := c.emptySplit()
+ j1 := c.top()
+ err := c.compileClassRange(rstart, rend)
+ if err != nil {
+ return err
+ }
+ jmps = append(jmps, c.emptyJump())
+ j2 := c.top()
+ c.setSplit(split, j1, j2)
+ }
+ // handle last pair
+ rstart := ast.Rune[len(ast.Rune)-2]
+ rend := ast.Rune[len(ast.Rune)-1]
+ err := c.compileClassRange(rstart, rend)
+ if err != nil {
+ return err
+ }
+ end := c.top()
+ for _, jmp := range jmps {
+ c.setJump(jmp, end)
+ }
+ return nil
+}
+
+func (c *compiler) compileClassRange(startR, endR rune) error {
+ seqs, err := utf8.NewSequences(startR, endR)
+ if err != nil {
+ return err
+ }
+ var jmps []uint
+
+ // does not do last entry
+ for i := 0; i < len(seqs)-1; i++ {
+ seq := seqs[i]
+ split := c.emptySplit()
+ j1 := c.top()
+ c.compileUtf8Ranges(seq)
+ jmps = append(jmps, c.emptyJump())
+ j2 := c.top()
+ c.setSplit(split, j1, j2)
+ }
+ // handle last entry
+ c.compileUtf8Ranges(seqs[len(seqs)-1])
+ end := c.top()
+ for _, jmp := range jmps {
+ c.setJump(jmp, end)
+ }
+
+ return nil
+}
+
+func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) {
+ for _, r := range seq {
+ c.insts = append(c.insts, &inst{
+ op: OpRange,
+ rangeStart: r.Start,
+ rangeEnd: r.End,
+ })
+ }
+}
+
+func (c *compiler) emptySplit() uint {
+ c.insts = append(c.insts, &inst{
+ op: OpSplit,
+ })
+ return c.top() - 1
+}
+
+func (c *compiler) emptyJump() uint {
+ c.insts = append(c.insts, &inst{
+ op: OpJmp,
+ })
+ return c.top() - 1
+}
+
+func (c *compiler) setSplit(i, pc1, pc2 uint) {
+ split := c.insts[i]
+ split.splitA = pc1
+ split.splitB = pc2
+}
+
+func (c *compiler) setJump(i, pc uint) {
+ jmp := c.insts[i]
+ jmp.to = pc
+}
+
+func (c *compiler) top() uint {
+ return uint(len(c.insts))
+}