diff options
author | Antoine GIRARD <sapk@users.noreply.github.com> | 2018-05-19 14:49:46 +0200 |
---|---|---|
committer | Lunny Xiao <xiaolunwen@gmail.com> | 2018-05-19 20:49:46 +0800 |
commit | 917b9641eca3fa1b1676ba1b4fd77a4e958ee153 (patch) | |
tree | 2caf049dfebccf5ccbc44316630a6c9220062d78 /vendor/github.com/couchbase/vellum/regexp | |
parent | 1b7cd3d0b0d3652e0660489b9c4da72619400c98 (diff) | |
download | gitea-917b9641eca3fa1b1676ba1b4fd77a4e958ee153.tar.gz gitea-917b9641eca3fa1b1676ba1b4fd77a4e958ee153.zip |
Update to last common bleve (#3986)
Diffstat (limited to 'vendor/github.com/couchbase/vellum/regexp')
-rw-r--r-- | vendor/github.com/couchbase/vellum/regexp/compile.go | 316 | ||||
-rw-r--r-- | vendor/github.com/couchbase/vellum/regexp/dfa.go | 188 | ||||
-rw-r--r-- | vendor/github.com/couchbase/vellum/regexp/inst.go | 62 | ||||
-rw-r--r-- | vendor/github.com/couchbase/vellum/regexp/regexp.go | 113 | ||||
-rw-r--r-- | vendor/github.com/couchbase/vellum/regexp/sparse.go | 54 |
5 files changed, 733 insertions, 0 deletions
diff --git a/vendor/github.com/couchbase/vellum/regexp/compile.go b/vendor/github.com/couchbase/vellum/regexp/compile.go new file mode 100644 index 0000000000..6922b749db --- /dev/null +++ b/vendor/github.com/couchbase/vellum/regexp/compile.go @@ -0,0 +1,316 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package regexp + +import ( + "regexp/syntax" + "unicode" + + "github.com/couchbase/vellum/utf8" +) + +type compiler struct { + sizeLimit uint + insts prog +} + +func newCompiler(sizeLimit uint) *compiler { + return &compiler{ + sizeLimit: sizeLimit, + } +} + +func (c *compiler) compile(ast *syntax.Regexp) (prog, error) { + err := c.c(ast) + if err != nil { + return nil, err + } + c.insts = append(c.insts, &inst{ + op: OpMatch, + }) + return c.insts, nil +} + +func (c *compiler) c(ast *syntax.Regexp) error { + if ast.Flags&syntax.NonGreedy > 1 { + return ErrNoLazy + } + + switch ast.Op { + case syntax.OpEndLine, syntax.OpBeginLine, + syntax.OpBeginText, syntax.OpEndText: + return ErrNoEmpty + case syntax.OpWordBoundary, syntax.OpNoWordBoundary: + return ErrNoWordBoundary + case syntax.OpEmptyMatch: + return nil + case syntax.OpLiteral: + for _, r := range ast.Rune { + if ast.Flags&syntax.FoldCase > 0 { + next := syntax.Regexp{ + Op: syntax.OpCharClass, + Flags: ast.Flags & syntax.FoldCase, + Rune0: [2]rune{r, r}, + } + next.Rune = next.Rune0[0:2] + return c.c(&next) + } + seqs, err := utf8.NewSequences(r, r) + if err != nil { + return err + } + for _, seq := range seqs { + c.compileUtf8Ranges(seq) + } + } + case syntax.OpAnyChar: + next := syntax.Regexp{ + Op: syntax.OpCharClass, + Flags: ast.Flags & syntax.FoldCase, + Rune0: [2]rune{0, unicode.MaxRune}, + } + next.Rune = next.Rune0[:2] + return c.c(&next) + case syntax.OpAnyCharNotNL: + next := syntax.Regexp{ + Op: syntax.OpCharClass, + Flags: ast.Flags & syntax.FoldCase, + Rune: []rune{0, 0x09, 0x0B, unicode.MaxRune}, + } + return c.c(&next) + case syntax.OpCharClass: + return c.compileClass(ast) + case syntax.OpCapture: + return c.c(ast.Sub[0]) + case syntax.OpConcat: + for _, sub := range ast.Sub { + err := c.c(sub) + if err != nil { + return err + } + } + return nil + case syntax.OpAlternate: + if len(ast.Sub) == 0 { + return nil + } + jmpsToEnd := []uint{} + + // does not handle last entry + for i := 0; i < len(ast.Sub)-1; i++ { + sub := ast.Sub[i] + split := c.emptySplit() + j1 := c.top() + err := c.c(sub) + if err != nil { + return err + } + jmpsToEnd = append(jmpsToEnd, c.emptyJump()) + j2 := c.top() + c.setSplit(split, j1, j2) + } + // handle last entry + err := c.c(ast.Sub[len(ast.Sub)-1]) + if err != nil { + return err + } + end := uint(len(c.insts)) + for _, jmpToEnd := range jmpsToEnd { + c.setJump(jmpToEnd, end) + } + case syntax.OpQuest: + split := c.emptySplit() + j1 := c.top() + err := c.c(ast.Sub[0]) + if err != nil { + return err + } + j2 := c.top() + c.setSplit(split, j1, j2) + + case syntax.OpStar: + j1 := c.top() + split := c.emptySplit() + j2 := c.top() + err := c.c(ast.Sub[0]) + if err != nil { + return err + } + jmp := c.emptyJump() + j3 := uint(len(c.insts)) + + c.setJump(jmp, j1) + c.setSplit(split, j2, j3) + + case syntax.OpPlus: + j1 := c.top() + err := c.c(ast.Sub[0]) + if err != nil { + return err + } + split := c.emptySplit() + j2 := c.top() + c.setSplit(split, j1, j2) + + case syntax.OpRepeat: + if ast.Max == -1 { + for i := 0; i < ast.Min; i++ { + err := c.c(ast.Sub[0]) + if err != nil { + return err + } + } + next := syntax.Regexp{ + Op: syntax.OpStar, + Flags: ast.Flags, + Sub: ast.Sub, + Sub0: ast.Sub0, + Rune: ast.Rune, + Rune0: ast.Rune0, + } + return c.c(&next) + } + for i := 0; i < ast.Min; i++ { + err := c.c(ast.Sub[0]) + if err != nil { + return err + } + } + var splits, starts []uint + for i := ast.Min; i < ast.Max; i++ { + splits = append(splits, c.emptySplit()) + starts = append(starts, uint(len(c.insts))) + err := c.c(ast.Sub[0]) + if err != nil { + return err + } + } + end := uint(len(c.insts)) + for i := 0; i < len(splits); i++ { + c.setSplit(splits[i], starts[i], end) + } + + } + + return c.checkSize() +} + +func (c *compiler) checkSize() error { + if uint(len(c.insts)*instSize) > c.sizeLimit { + return ErrCompiledTooBig + } + return nil +} + +func (c *compiler) compileClass(ast *syntax.Regexp) error { + if len(ast.Rune) == 0 { + return nil + } + var jmps []uint + + // does not do last pair + for i := 0; i < len(ast.Rune)-2; i += 2 { + rstart := ast.Rune[i] + rend := ast.Rune[i+1] + + split := c.emptySplit() + j1 := c.top() + err := c.compileClassRange(rstart, rend) + if err != nil { + return err + } + jmps = append(jmps, c.emptyJump()) + j2 := c.top() + c.setSplit(split, j1, j2) + } + // handle last pair + rstart := ast.Rune[len(ast.Rune)-2] + rend := ast.Rune[len(ast.Rune)-1] + err := c.compileClassRange(rstart, rend) + if err != nil { + return err + } + end := c.top() + for _, jmp := range jmps { + c.setJump(jmp, end) + } + return nil +} + +func (c *compiler) compileClassRange(startR, endR rune) error { + seqs, err := utf8.NewSequences(startR, endR) + if err != nil { + return err + } + var jmps []uint + + // does not do last entry + for i := 0; i < len(seqs)-1; i++ { + seq := seqs[i] + split := c.emptySplit() + j1 := c.top() + c.compileUtf8Ranges(seq) + jmps = append(jmps, c.emptyJump()) + j2 := c.top() + c.setSplit(split, j1, j2) + } + // handle last entry + c.compileUtf8Ranges(seqs[len(seqs)-1]) + end := c.top() + for _, jmp := range jmps { + c.setJump(jmp, end) + } + + return nil +} + +func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) { + for _, r := range seq { + c.insts = append(c.insts, &inst{ + op: OpRange, + rangeStart: r.Start, + rangeEnd: r.End, + }) + } +} + +func (c *compiler) emptySplit() uint { + c.insts = append(c.insts, &inst{ + op: OpSplit, + }) + return c.top() - 1 +} + +func (c *compiler) emptyJump() uint { + c.insts = append(c.insts, &inst{ + op: OpJmp, + }) + return c.top() - 1 +} + +func (c *compiler) setSplit(i, pc1, pc2 uint) { + split := c.insts[i] + split.splitA = pc1 + split.splitB = pc2 +} + +func (c *compiler) setJump(i, pc uint) { + jmp := c.insts[i] + jmp.to = pc +} + +func (c *compiler) top() uint { + return uint(len(c.insts)) +} diff --git a/vendor/github.com/couchbase/vellum/regexp/dfa.go b/vendor/github.com/couchbase/vellum/regexp/dfa.go new file mode 100644 index 0000000000..9864606b6a --- /dev/null +++ b/vendor/github.com/couchbase/vellum/regexp/dfa.go @@ -0,0 +1,188 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package regexp + +import ( + "encoding/binary" + "fmt" +) + +// StateLimit is the maximum number of states allowed +const StateLimit = 10000 + +// ErrTooManyStates is returned if you attempt to build a Levenshtein +// automaton which requries too many states. +var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states", + StateLimit) + +type dfaBuilder struct { + dfa *dfa + cache map[string]int + keyBuf []byte +} + +func newDfaBuilder(insts prog) *dfaBuilder { + d := &dfaBuilder{ + dfa: &dfa{ + insts: insts, + states: make([]*state, 0, 16), + }, + cache: make(map[string]int, 1024), + } + // add 0 state that is invalid + d.dfa.states = append(d.dfa.states, &state{ + next: make([]int, 256), + match: false, + }) + return d +} + +func (d *dfaBuilder) build() (*dfa, error) { + cur := newSparseSet(uint(len(d.dfa.insts))) + next := newSparseSet(uint(len(d.dfa.insts))) + + d.dfa.add(cur, 0) + states := intStack{d.cachedState(cur)} + seen := make(map[int]struct{}) + var s int + states, s = states.Pop() + for s != 0 { + for b := 0; b < 256; b++ { + ns := d.runState(cur, next, s, byte(b)) + if ns != 0 { + if _, ok := seen[ns]; !ok { + seen[ns] = struct{}{} + states = states.Push(ns) + } + } + if len(d.dfa.states) > StateLimit { + return nil, ErrTooManyStates + } + } + states, s = states.Pop() + } + return d.dfa, nil +} + +func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte) int { + cur.Clear() + for _, ip := range d.dfa.states[state].insts { + cur.Add(ip) + } + d.dfa.run(cur, next, b) + nextState := d.cachedState(next) + d.dfa.states[state].next[b] = nextState + return nextState +} + +func instsKey(insts []uint, buf []byte) []byte { + if cap(buf) < 8*len(insts) { + buf = make([]byte, 8*len(insts)) + } else { + buf = buf[0 : 8*len(insts)] + } + for i, inst := range insts { + binary.LittleEndian.PutUint64(buf[i*8:], uint64(inst)) + } + return buf +} + +func (d *dfaBuilder) cachedState(set *sparseSet) int { + var insts []uint + var isMatch bool + for i := uint(0); i < uint(set.Len()); i++ { + ip := set.Get(i) + switch d.dfa.insts[ip].op { + case OpRange: + insts = append(insts, ip) + case OpMatch: + isMatch = true + insts = append(insts, ip) + } + } + if len(insts) == 0 { + return 0 + } + d.keyBuf = instsKey(insts, d.keyBuf) + v, ok := d.cache[string(d.keyBuf)] + if ok { + return v + } + d.dfa.states = append(d.dfa.states, &state{ + insts: insts, + next: make([]int, 256), + match: isMatch, + }) + newV := len(d.dfa.states) - 1 + d.cache[string(d.keyBuf)] = newV + return newV +} + +type dfa struct { + insts prog + states []*state +} + +func (d *dfa) add(set *sparseSet, ip uint) { + if set.Contains(ip) { + return + } + set.Add(ip) + switch d.insts[ip].op { + case OpJmp: + d.add(set, d.insts[ip].to) + case OpSplit: + d.add(set, d.insts[ip].splitA) + d.add(set, d.insts[ip].splitB) + } +} + +func (d *dfa) run(from, to *sparseSet, b byte) bool { + to.Clear() + var isMatch bool + for i := uint(0); i < uint(from.Len()); i++ { + ip := from.Get(i) + switch d.insts[ip].op { + case OpMatch: + isMatch = true + case OpRange: + if d.insts[ip].rangeStart <= b && + b <= d.insts[ip].rangeEnd { + d.add(to, ip+1) + } + } + } + return isMatch +} + +type state struct { + insts []uint + next []int + match bool +} + +type intStack []int + +func (s intStack) Push(v int) intStack { + return append(s, v) +} + +func (s intStack) Pop() (intStack, int) { + l := len(s) + if l < 1 { + return s, 0 + } + return s[:l-1], s[l-1] +} diff --git a/vendor/github.com/couchbase/vellum/regexp/inst.go b/vendor/github.com/couchbase/vellum/regexp/inst.go new file mode 100644 index 0000000000..61cbf2f333 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/regexp/inst.go @@ -0,0 +1,62 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package regexp + +import "fmt" + +// instOp represents a instruction operation +type instOp int + +// the enumeration of operations +const ( + OpMatch instOp = iota + OpJmp + OpSplit + OpRange +) + +// instSize is the approxmiate size of the an inst struct in bytes +const instSize = 40 + +type inst struct { + op instOp + to uint + splitA uint + splitB uint + rangeStart byte + rangeEnd byte +} + +func (i *inst) String() string { + switch i.op { + case OpJmp: + return fmt.Sprintf("JMP: %d", i.to) + case OpSplit: + return fmt.Sprintf("SPLIT: %d - %d", i.splitA, i.splitB) + case OpRange: + return fmt.Sprintf("RANGE: %x - %x", i.rangeStart, i.rangeEnd) + } + return "MATCH" +} + +type prog []*inst + +func (p prog) String() string { + rv := "\n" + for i, pi := range p { + rv += fmt.Sprintf("%d %v\n", i, pi) + } + return rv +} diff --git a/vendor/github.com/couchbase/vellum/regexp/regexp.go b/vendor/github.com/couchbase/vellum/regexp/regexp.go new file mode 100644 index 0000000000..ed0e7823e1 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/regexp/regexp.go @@ -0,0 +1,113 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package regexp + +import ( + "fmt" + "regexp/syntax" +) + +// ErrNoEmpty returned when "zero width assertions" are used +var ErrNoEmpty = fmt.Errorf("zero width assertions not allowed") + +// ErrNoWordBoundary returned when word boundaries are used +var ErrNoWordBoundary = fmt.Errorf("word boundaries are not allowed") + +// ErrNoBytes returned when byte literals are used +var ErrNoBytes = fmt.Errorf("byte literals are not allowed") + +// ErrNoLazy returned when lazy quantifiers are used +var ErrNoLazy = fmt.Errorf("lazy quantifiers are not allowed") + +// ErrCompiledTooBig returned when regular expression parses into +// too many instructions +var ErrCompiledTooBig = fmt.Errorf("too many instructions") + +// Regexp implements the vellum.Automaton interface for matcing a user +// specified regular expression. +type Regexp struct { + orig string + dfa *dfa +} + +// NewRegexp creates a new Regular Expression automaton with the specified +// expression. By default it is limited to approximately 10MB for the +// compiled finite state automaton. If this size is exceeded, +// ErrCompiledTooBig will be returned. +func New(expr string) (*Regexp, error) { + return NewWithLimit(expr, 10*(1<<20)) +} + +// NewRegexpWithLimit creates a new Regular Expression automaton with +// the specified expression. The size of the compiled finite state +// automaton exceeds the user specified size, ErrCompiledTooBig will be +// returned. +func NewWithLimit(expr string, size uint) (*Regexp, error) { + parsed, err := syntax.Parse(expr, syntax.Perl) + if err != nil { + return nil, err + } + compiler := newCompiler(size) + insts, err := compiler.compile(parsed) + if err != nil { + return nil, err + } + dfaBuilder := newDfaBuilder(insts) + dfa, err := dfaBuilder.build() + if err != nil { + return nil, err + } + return &Regexp{ + orig: expr, + dfa: dfa, + }, nil +} + +// Start returns the start state of this automaton. +func (r *Regexp) Start() int { + return 1 +} + +// IsMatch returns if the specified state is a matching state. +func (r *Regexp) IsMatch(s int) bool { + if s < len(r.dfa.states) { + return r.dfa.states[s].match + } + return false +} + +// CanMatch returns if the specified state can ever transition to a matching +// state. +func (r *Regexp) CanMatch(s int) bool { + if s < len(r.dfa.states) && s > 0 { + return true + } + return false +} + +// WillAlwaysMatch returns if the specified state will always end in a +// matching state. +func (r *Regexp) WillAlwaysMatch(int) bool { + return false +} + +// Accept returns the new state, resulting from the transite byte b +// when currently in the state s. +func (r *Regexp) Accept(s int, b byte) int { + if s < len(r.dfa.states) { + return r.dfa.states[s].next[b] + } + return 0 +} diff --git a/vendor/github.com/couchbase/vellum/regexp/sparse.go b/vendor/github.com/couchbase/vellum/regexp/sparse.go new file mode 100644 index 0000000000..7afbfceba6 --- /dev/null +++ b/vendor/github.com/couchbase/vellum/regexp/sparse.go @@ -0,0 +1,54 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package regexp + +type sparseSet struct { + dense []uint + sparse []uint + size uint +} + +func newSparseSet(size uint) *sparseSet { + return &sparseSet{ + dense: make([]uint, size), + sparse: make([]uint, size), + size: 0, + } +} + +func (s *sparseSet) Len() int { + return int(s.size) +} + +func (s *sparseSet) Add(ip uint) uint { + i := s.size + s.dense[i] = ip + s.sparse[ip] = i + s.size++ + return i +} + +func (s *sparseSet) Get(i uint) uint { + return s.dense[i] +} + +func (s *sparseSet) Contains(ip uint) bool { + i := s.sparse[ip] + return i < s.size && s.dense[i] == ip +} + +func (s *sparseSet) Clear() { + s.size = 0 +} |