aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/github.com/couchbase/vellum/regexp
diff options
context:
space:
mode:
authorAntoine GIRARD <sapk@users.noreply.github.com>2018-05-19 14:49:46 +0200
committerLunny Xiao <xiaolunwen@gmail.com>2018-05-19 20:49:46 +0800
commit917b9641eca3fa1b1676ba1b4fd77a4e958ee153 (patch)
tree2caf049dfebccf5ccbc44316630a6c9220062d78 /vendor/github.com/couchbase/vellum/regexp
parent1b7cd3d0b0d3652e0660489b9c4da72619400c98 (diff)
downloadgitea-917b9641eca3fa1b1676ba1b4fd77a4e958ee153.tar.gz
gitea-917b9641eca3fa1b1676ba1b4fd77a4e958ee153.zip
Update to last common bleve (#3986)
Diffstat (limited to 'vendor/github.com/couchbase/vellum/regexp')
-rw-r--r--vendor/github.com/couchbase/vellum/regexp/compile.go316
-rw-r--r--vendor/github.com/couchbase/vellum/regexp/dfa.go188
-rw-r--r--vendor/github.com/couchbase/vellum/regexp/inst.go62
-rw-r--r--vendor/github.com/couchbase/vellum/regexp/regexp.go113
-rw-r--r--vendor/github.com/couchbase/vellum/regexp/sparse.go54
5 files changed, 733 insertions, 0 deletions
diff --git a/vendor/github.com/couchbase/vellum/regexp/compile.go b/vendor/github.com/couchbase/vellum/regexp/compile.go
new file mode 100644
index 0000000000..6922b749db
--- /dev/null
+++ b/vendor/github.com/couchbase/vellum/regexp/compile.go
@@ -0,0 +1,316 @@
+// Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+import (
+ "regexp/syntax"
+ "unicode"
+
+ "github.com/couchbase/vellum/utf8"
+)
+
+type compiler struct {
+ sizeLimit uint
+ insts prog
+}
+
+func newCompiler(sizeLimit uint) *compiler {
+ return &compiler{
+ sizeLimit: sizeLimit,
+ }
+}
+
+func (c *compiler) compile(ast *syntax.Regexp) (prog, error) {
+ err := c.c(ast)
+ if err != nil {
+ return nil, err
+ }
+ c.insts = append(c.insts, &inst{
+ op: OpMatch,
+ })
+ return c.insts, nil
+}
+
+func (c *compiler) c(ast *syntax.Regexp) error {
+ if ast.Flags&syntax.NonGreedy > 1 {
+ return ErrNoLazy
+ }
+
+ switch ast.Op {
+ case syntax.OpEndLine, syntax.OpBeginLine,
+ syntax.OpBeginText, syntax.OpEndText:
+ return ErrNoEmpty
+ case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
+ return ErrNoWordBoundary
+ case syntax.OpEmptyMatch:
+ return nil
+ case syntax.OpLiteral:
+ for _, r := range ast.Rune {
+ if ast.Flags&syntax.FoldCase > 0 {
+ next := syntax.Regexp{
+ Op: syntax.OpCharClass,
+ Flags: ast.Flags & syntax.FoldCase,
+ Rune0: [2]rune{r, r},
+ }
+ next.Rune = next.Rune0[0:2]
+ return c.c(&next)
+ }
+ seqs, err := utf8.NewSequences(r, r)
+ if err != nil {
+ return err
+ }
+ for _, seq := range seqs {
+ c.compileUtf8Ranges(seq)
+ }
+ }
+ case syntax.OpAnyChar:
+ next := syntax.Regexp{
+ Op: syntax.OpCharClass,
+ Flags: ast.Flags & syntax.FoldCase,
+ Rune0: [2]rune{0, unicode.MaxRune},
+ }
+ next.Rune = next.Rune0[:2]
+ return c.c(&next)
+ case syntax.OpAnyCharNotNL:
+ next := syntax.Regexp{
+ Op: syntax.OpCharClass,
+ Flags: ast.Flags & syntax.FoldCase,
+ Rune: []rune{0, 0x09, 0x0B, unicode.MaxRune},
+ }
+ return c.c(&next)
+ case syntax.OpCharClass:
+ return c.compileClass(ast)
+ case syntax.OpCapture:
+ return c.c(ast.Sub[0])
+ case syntax.OpConcat:
+ for _, sub := range ast.Sub {
+ err := c.c(sub)
+ if err != nil {
+ return err
+ }
+ }
+ return nil
+ case syntax.OpAlternate:
+ if len(ast.Sub) == 0 {
+ return nil
+ }
+ jmpsToEnd := []uint{}
+
+ // does not handle last entry
+ for i := 0; i < len(ast.Sub)-1; i++ {
+ sub := ast.Sub[i]
+ split := c.emptySplit()
+ j1 := c.top()
+ err := c.c(sub)
+ if err != nil {
+ return err
+ }
+ jmpsToEnd = append(jmpsToEnd, c.emptyJump())
+ j2 := c.top()
+ c.setSplit(split, j1, j2)
+ }
+ // handle last entry
+ err := c.c(ast.Sub[len(ast.Sub)-1])
+ if err != nil {
+ return err
+ }
+ end := uint(len(c.insts))
+ for _, jmpToEnd := range jmpsToEnd {
+ c.setJump(jmpToEnd, end)
+ }
+ case syntax.OpQuest:
+ split := c.emptySplit()
+ j1 := c.top()
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ j2 := c.top()
+ c.setSplit(split, j1, j2)
+
+ case syntax.OpStar:
+ j1 := c.top()
+ split := c.emptySplit()
+ j2 := c.top()
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ jmp := c.emptyJump()
+ j3 := uint(len(c.insts))
+
+ c.setJump(jmp, j1)
+ c.setSplit(split, j2, j3)
+
+ case syntax.OpPlus:
+ j1 := c.top()
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ split := c.emptySplit()
+ j2 := c.top()
+ c.setSplit(split, j1, j2)
+
+ case syntax.OpRepeat:
+ if ast.Max == -1 {
+ for i := 0; i < ast.Min; i++ {
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ }
+ next := syntax.Regexp{
+ Op: syntax.OpStar,
+ Flags: ast.Flags,
+ Sub: ast.Sub,
+ Sub0: ast.Sub0,
+ Rune: ast.Rune,
+ Rune0: ast.Rune0,
+ }
+ return c.c(&next)
+ }
+ for i := 0; i < ast.Min; i++ {
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ }
+ var splits, starts []uint
+ for i := ast.Min; i < ast.Max; i++ {
+ splits = append(splits, c.emptySplit())
+ starts = append(starts, uint(len(c.insts)))
+ err := c.c(ast.Sub[0])
+ if err != nil {
+ return err
+ }
+ }
+ end := uint(len(c.insts))
+ for i := 0; i < len(splits); i++ {
+ c.setSplit(splits[i], starts[i], end)
+ }
+
+ }
+
+ return c.checkSize()
+}
+
+func (c *compiler) checkSize() error {
+ if uint(len(c.insts)*instSize) > c.sizeLimit {
+ return ErrCompiledTooBig
+ }
+ return nil
+}
+
+func (c *compiler) compileClass(ast *syntax.Regexp) error {
+ if len(ast.Rune) == 0 {
+ return nil
+ }
+ var jmps []uint
+
+ // does not do last pair
+ for i := 0; i < len(ast.Rune)-2; i += 2 {
+ rstart := ast.Rune[i]
+ rend := ast.Rune[i+1]
+
+ split := c.emptySplit()
+ j1 := c.top()
+ err := c.compileClassRange(rstart, rend)
+ if err != nil {
+ return err
+ }
+ jmps = append(jmps, c.emptyJump())
+ j2 := c.top()
+ c.setSplit(split, j1, j2)
+ }
+ // handle last pair
+ rstart := ast.Rune[len(ast.Rune)-2]
+ rend := ast.Rune[len(ast.Rune)-1]
+ err := c.compileClassRange(rstart, rend)
+ if err != nil {
+ return err
+ }
+ end := c.top()
+ for _, jmp := range jmps {
+ c.setJump(jmp, end)
+ }
+ return nil
+}
+
+func (c *compiler) compileClassRange(startR, endR rune) error {
+ seqs, err := utf8.NewSequences(startR, endR)
+ if err != nil {
+ return err
+ }
+ var jmps []uint
+
+ // does not do last entry
+ for i := 0; i < len(seqs)-1; i++ {
+ seq := seqs[i]
+ split := c.emptySplit()
+ j1 := c.top()
+ c.compileUtf8Ranges(seq)
+ jmps = append(jmps, c.emptyJump())
+ j2 := c.top()
+ c.setSplit(split, j1, j2)
+ }
+ // handle last entry
+ c.compileUtf8Ranges(seqs[len(seqs)-1])
+ end := c.top()
+ for _, jmp := range jmps {
+ c.setJump(jmp, end)
+ }
+
+ return nil
+}
+
+func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) {
+ for _, r := range seq {
+ c.insts = append(c.insts, &inst{
+ op: OpRange,
+ rangeStart: r.Start,
+ rangeEnd: r.End,
+ })
+ }
+}
+
+func (c *compiler) emptySplit() uint {
+ c.insts = append(c.insts, &inst{
+ op: OpSplit,
+ })
+ return c.top() - 1
+}
+
+func (c *compiler) emptyJump() uint {
+ c.insts = append(c.insts, &inst{
+ op: OpJmp,
+ })
+ return c.top() - 1
+}
+
+func (c *compiler) setSplit(i, pc1, pc2 uint) {
+ split := c.insts[i]
+ split.splitA = pc1
+ split.splitB = pc2
+}
+
+func (c *compiler) setJump(i, pc uint) {
+ jmp := c.insts[i]
+ jmp.to = pc
+}
+
+func (c *compiler) top() uint {
+ return uint(len(c.insts))
+}
diff --git a/vendor/github.com/couchbase/vellum/regexp/dfa.go b/vendor/github.com/couchbase/vellum/regexp/dfa.go
new file mode 100644
index 0000000000..9864606b6a
--- /dev/null
+++ b/vendor/github.com/couchbase/vellum/regexp/dfa.go
@@ -0,0 +1,188 @@
+// Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+import (
+ "encoding/binary"
+ "fmt"
+)
+
+// StateLimit is the maximum number of states allowed
+const StateLimit = 10000
+
+// ErrTooManyStates is returned if you attempt to build a Levenshtein
+// automaton which requries too many states.
+var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states",
+ StateLimit)
+
+type dfaBuilder struct {
+ dfa *dfa
+ cache map[string]int
+ keyBuf []byte
+}
+
+func newDfaBuilder(insts prog) *dfaBuilder {
+ d := &dfaBuilder{
+ dfa: &dfa{
+ insts: insts,
+ states: make([]*state, 0, 16),
+ },
+ cache: make(map[string]int, 1024),
+ }
+ // add 0 state that is invalid
+ d.dfa.states = append(d.dfa.states, &state{
+ next: make([]int, 256),
+ match: false,
+ })
+ return d
+}
+
+func (d *dfaBuilder) build() (*dfa, error) {
+ cur := newSparseSet(uint(len(d.dfa.insts)))
+ next := newSparseSet(uint(len(d.dfa.insts)))
+
+ d.dfa.add(cur, 0)
+ states := intStack{d.cachedState(cur)}
+ seen := make(map[int]struct{})
+ var s int
+ states, s = states.Pop()
+ for s != 0 {
+ for b := 0; b < 256; b++ {
+ ns := d.runState(cur, next, s, byte(b))
+ if ns != 0 {
+ if _, ok := seen[ns]; !ok {
+ seen[ns] = struct{}{}
+ states = states.Push(ns)
+ }
+ }
+ if len(d.dfa.states) > StateLimit {
+ return nil, ErrTooManyStates
+ }
+ }
+ states, s = states.Pop()
+ }
+ return d.dfa, nil
+}
+
+func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte) int {
+ cur.Clear()
+ for _, ip := range d.dfa.states[state].insts {
+ cur.Add(ip)
+ }
+ d.dfa.run(cur, next, b)
+ nextState := d.cachedState(next)
+ d.dfa.states[state].next[b] = nextState
+ return nextState
+}
+
+func instsKey(insts []uint, buf []byte) []byte {
+ if cap(buf) < 8*len(insts) {
+ buf = make([]byte, 8*len(insts))
+ } else {
+ buf = buf[0 : 8*len(insts)]
+ }
+ for i, inst := range insts {
+ binary.LittleEndian.PutUint64(buf[i*8:], uint64(inst))
+ }
+ return buf
+}
+
+func (d *dfaBuilder) cachedState(set *sparseSet) int {
+ var insts []uint
+ var isMatch bool
+ for i := uint(0); i < uint(set.Len()); i++ {
+ ip := set.Get(i)
+ switch d.dfa.insts[ip].op {
+ case OpRange:
+ insts = append(insts, ip)
+ case OpMatch:
+ isMatch = true
+ insts = append(insts, ip)
+ }
+ }
+ if len(insts) == 0 {
+ return 0
+ }
+ d.keyBuf = instsKey(insts, d.keyBuf)
+ v, ok := d.cache[string(d.keyBuf)]
+ if ok {
+ return v
+ }
+ d.dfa.states = append(d.dfa.states, &state{
+ insts: insts,
+ next: make([]int, 256),
+ match: isMatch,
+ })
+ newV := len(d.dfa.states) - 1
+ d.cache[string(d.keyBuf)] = newV
+ return newV
+}
+
+type dfa struct {
+ insts prog
+ states []*state
+}
+
+func (d *dfa) add(set *sparseSet, ip uint) {
+ if set.Contains(ip) {
+ return
+ }
+ set.Add(ip)
+ switch d.insts[ip].op {
+ case OpJmp:
+ d.add(set, d.insts[ip].to)
+ case OpSplit:
+ d.add(set, d.insts[ip].splitA)
+ d.add(set, d.insts[ip].splitB)
+ }
+}
+
+func (d *dfa) run(from, to *sparseSet, b byte) bool {
+ to.Clear()
+ var isMatch bool
+ for i := uint(0); i < uint(from.Len()); i++ {
+ ip := from.Get(i)
+ switch d.insts[ip].op {
+ case OpMatch:
+ isMatch = true
+ case OpRange:
+ if d.insts[ip].rangeStart <= b &&
+ b <= d.insts[ip].rangeEnd {
+ d.add(to, ip+1)
+ }
+ }
+ }
+ return isMatch
+}
+
+type state struct {
+ insts []uint
+ next []int
+ match bool
+}
+
+type intStack []int
+
+func (s intStack) Push(v int) intStack {
+ return append(s, v)
+}
+
+func (s intStack) Pop() (intStack, int) {
+ l := len(s)
+ if l < 1 {
+ return s, 0
+ }
+ return s[:l-1], s[l-1]
+}
diff --git a/vendor/github.com/couchbase/vellum/regexp/inst.go b/vendor/github.com/couchbase/vellum/regexp/inst.go
new file mode 100644
index 0000000000..61cbf2f333
--- /dev/null
+++ b/vendor/github.com/couchbase/vellum/regexp/inst.go
@@ -0,0 +1,62 @@
+// Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+import "fmt"
+
+// instOp represents a instruction operation
+type instOp int
+
+// the enumeration of operations
+const (
+ OpMatch instOp = iota
+ OpJmp
+ OpSplit
+ OpRange
+)
+
+// instSize is the approxmiate size of the an inst struct in bytes
+const instSize = 40
+
+type inst struct {
+ op instOp
+ to uint
+ splitA uint
+ splitB uint
+ rangeStart byte
+ rangeEnd byte
+}
+
+func (i *inst) String() string {
+ switch i.op {
+ case OpJmp:
+ return fmt.Sprintf("JMP: %d", i.to)
+ case OpSplit:
+ return fmt.Sprintf("SPLIT: %d - %d", i.splitA, i.splitB)
+ case OpRange:
+ return fmt.Sprintf("RANGE: %x - %x", i.rangeStart, i.rangeEnd)
+ }
+ return "MATCH"
+}
+
+type prog []*inst
+
+func (p prog) String() string {
+ rv := "\n"
+ for i, pi := range p {
+ rv += fmt.Sprintf("%d %v\n", i, pi)
+ }
+ return rv
+}
diff --git a/vendor/github.com/couchbase/vellum/regexp/regexp.go b/vendor/github.com/couchbase/vellum/regexp/regexp.go
new file mode 100644
index 0000000000..ed0e7823e1
--- /dev/null
+++ b/vendor/github.com/couchbase/vellum/regexp/regexp.go
@@ -0,0 +1,113 @@
+// Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+import (
+ "fmt"
+ "regexp/syntax"
+)
+
+// ErrNoEmpty returned when "zero width assertions" are used
+var ErrNoEmpty = fmt.Errorf("zero width assertions not allowed")
+
+// ErrNoWordBoundary returned when word boundaries are used
+var ErrNoWordBoundary = fmt.Errorf("word boundaries are not allowed")
+
+// ErrNoBytes returned when byte literals are used
+var ErrNoBytes = fmt.Errorf("byte literals are not allowed")
+
+// ErrNoLazy returned when lazy quantifiers are used
+var ErrNoLazy = fmt.Errorf("lazy quantifiers are not allowed")
+
+// ErrCompiledTooBig returned when regular expression parses into
+// too many instructions
+var ErrCompiledTooBig = fmt.Errorf("too many instructions")
+
+// Regexp implements the vellum.Automaton interface for matcing a user
+// specified regular expression.
+type Regexp struct {
+ orig string
+ dfa *dfa
+}
+
+// NewRegexp creates a new Regular Expression automaton with the specified
+// expression. By default it is limited to approximately 10MB for the
+// compiled finite state automaton. If this size is exceeded,
+// ErrCompiledTooBig will be returned.
+func New(expr string) (*Regexp, error) {
+ return NewWithLimit(expr, 10*(1<<20))
+}
+
+// NewRegexpWithLimit creates a new Regular Expression automaton with
+// the specified expression. The size of the compiled finite state
+// automaton exceeds the user specified size, ErrCompiledTooBig will be
+// returned.
+func NewWithLimit(expr string, size uint) (*Regexp, error) {
+ parsed, err := syntax.Parse(expr, syntax.Perl)
+ if err != nil {
+ return nil, err
+ }
+ compiler := newCompiler(size)
+ insts, err := compiler.compile(parsed)
+ if err != nil {
+ return nil, err
+ }
+ dfaBuilder := newDfaBuilder(insts)
+ dfa, err := dfaBuilder.build()
+ if err != nil {
+ return nil, err
+ }
+ return &Regexp{
+ orig: expr,
+ dfa: dfa,
+ }, nil
+}
+
+// Start returns the start state of this automaton.
+func (r *Regexp) Start() int {
+ return 1
+}
+
+// IsMatch returns if the specified state is a matching state.
+func (r *Regexp) IsMatch(s int) bool {
+ if s < len(r.dfa.states) {
+ return r.dfa.states[s].match
+ }
+ return false
+}
+
+// CanMatch returns if the specified state can ever transition to a matching
+// state.
+func (r *Regexp) CanMatch(s int) bool {
+ if s < len(r.dfa.states) && s > 0 {
+ return true
+ }
+ return false
+}
+
+// WillAlwaysMatch returns if the specified state will always end in a
+// matching state.
+func (r *Regexp) WillAlwaysMatch(int) bool {
+ return false
+}
+
+// Accept returns the new state, resulting from the transite byte b
+// when currently in the state s.
+func (r *Regexp) Accept(s int, b byte) int {
+ if s < len(r.dfa.states) {
+ return r.dfa.states[s].next[b]
+ }
+ return 0
+}
diff --git a/vendor/github.com/couchbase/vellum/regexp/sparse.go b/vendor/github.com/couchbase/vellum/regexp/sparse.go
new file mode 100644
index 0000000000..7afbfceba6
--- /dev/null
+++ b/vendor/github.com/couchbase/vellum/regexp/sparse.go
@@ -0,0 +1,54 @@
+// Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+type sparseSet struct {
+ dense []uint
+ sparse []uint
+ size uint
+}
+
+func newSparseSet(size uint) *sparseSet {
+ return &sparseSet{
+ dense: make([]uint, size),
+ sparse: make([]uint, size),
+ size: 0,
+ }
+}
+
+func (s *sparseSet) Len() int {
+ return int(s.size)
+}
+
+func (s *sparseSet) Add(ip uint) uint {
+ i := s.size
+ s.dense[i] = ip
+ s.sparse[ip] = i
+ s.size++
+ return i
+}
+
+func (s *sparseSet) Get(i uint) uint {
+ return s.dense[i]
+}
+
+func (s *sparseSet) Contains(ip uint) bool {
+ i := s.sparse[ip]
+ return i < s.size && s.dense[i] == ip
+}
+
+func (s *sparseSet) Clear() {
+ s.size = 0
+}