diff options
Diffstat (limited to 'vendor/github.com/couchbase/vellum/regexp/dfa.go')
-rw-r--r-- | vendor/github.com/couchbase/vellum/regexp/dfa.go | 188 |
1 files changed, 188 insertions, 0 deletions
diff --git a/vendor/github.com/couchbase/vellum/regexp/dfa.go b/vendor/github.com/couchbase/vellum/regexp/dfa.go new file mode 100644 index 0000000000..9864606b6a --- /dev/null +++ b/vendor/github.com/couchbase/vellum/regexp/dfa.go @@ -0,0 +1,188 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package regexp + +import ( + "encoding/binary" + "fmt" +) + +// StateLimit is the maximum number of states allowed +const StateLimit = 10000 + +// ErrTooManyStates is returned if you attempt to build a Levenshtein +// automaton which requries too many states. +var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states", + StateLimit) + +type dfaBuilder struct { + dfa *dfa + cache map[string]int + keyBuf []byte +} + +func newDfaBuilder(insts prog) *dfaBuilder { + d := &dfaBuilder{ + dfa: &dfa{ + insts: insts, + states: make([]*state, 0, 16), + }, + cache: make(map[string]int, 1024), + } + // add 0 state that is invalid + d.dfa.states = append(d.dfa.states, &state{ + next: make([]int, 256), + match: false, + }) + return d +} + +func (d *dfaBuilder) build() (*dfa, error) { + cur := newSparseSet(uint(len(d.dfa.insts))) + next := newSparseSet(uint(len(d.dfa.insts))) + + d.dfa.add(cur, 0) + states := intStack{d.cachedState(cur)} + seen := make(map[int]struct{}) + var s int + states, s = states.Pop() + for s != 0 { + for b := 0; b < 256; b++ { + ns := d.runState(cur, next, s, byte(b)) + if ns != 0 { + if _, ok := seen[ns]; !ok { + seen[ns] = struct{}{} + states = states.Push(ns) + } + } + if len(d.dfa.states) > StateLimit { + return nil, ErrTooManyStates + } + } + states, s = states.Pop() + } + return d.dfa, nil +} + +func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte) int { + cur.Clear() + for _, ip := range d.dfa.states[state].insts { + cur.Add(ip) + } + d.dfa.run(cur, next, b) + nextState := d.cachedState(next) + d.dfa.states[state].next[b] = nextState + return nextState +} + +func instsKey(insts []uint, buf []byte) []byte { + if cap(buf) < 8*len(insts) { + buf = make([]byte, 8*len(insts)) + } else { + buf = buf[0 : 8*len(insts)] + } + for i, inst := range insts { + binary.LittleEndian.PutUint64(buf[i*8:], uint64(inst)) + } + return buf +} + +func (d *dfaBuilder) cachedState(set *sparseSet) int { + var insts []uint + var isMatch bool + for i := uint(0); i < uint(set.Len()); i++ { + ip := set.Get(i) + switch d.dfa.insts[ip].op { + case OpRange: + insts = append(insts, ip) + case OpMatch: + isMatch = true + insts = append(insts, ip) + } + } + if len(insts) == 0 { + return 0 + } + d.keyBuf = instsKey(insts, d.keyBuf) + v, ok := d.cache[string(d.keyBuf)] + if ok { + return v + } + d.dfa.states = append(d.dfa.states, &state{ + insts: insts, + next: make([]int, 256), + match: isMatch, + }) + newV := len(d.dfa.states) - 1 + d.cache[string(d.keyBuf)] = newV + return newV +} + +type dfa struct { + insts prog + states []*state +} + +func (d *dfa) add(set *sparseSet, ip uint) { + if set.Contains(ip) { + return + } + set.Add(ip) + switch d.insts[ip].op { + case OpJmp: + d.add(set, d.insts[ip].to) + case OpSplit: + d.add(set, d.insts[ip].splitA) + d.add(set, d.insts[ip].splitB) + } +} + +func (d *dfa) run(from, to *sparseSet, b byte) bool { + to.Clear() + var isMatch bool + for i := uint(0); i < uint(from.Len()); i++ { + ip := from.Get(i) + switch d.insts[ip].op { + case OpMatch: + isMatch = true + case OpRange: + if d.insts[ip].rangeStart <= b && + b <= d.insts[ip].rangeEnd { + d.add(to, ip+1) + } + } + } + return isMatch +} + +type state struct { + insts []uint + next []int + match bool +} + +type intStack []int + +func (s intStack) Push(v int) intStack { + return append(s, v) +} + +func (s intStack) Pop() (intStack, int) { + l := len(s) + if l < 1 { + return s, 0 + } + return s[:l-1], s[l-1] +} |