package chroma

import (
	"fmt"
	"os"
	"regexp"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// An Emitter takes group matches and returns tokens.
type Emitter interface {
	// Emit tokens for the given regex groups.
	Emit(groups []string, lexer Lexer) Iterator
}

// EmitterFunc is a function that is an Emitter.
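//
// For illustration, a minimal custom emitter (a sketch; it assumes Literator,
// chroma's helper that builds an Iterator over literal tokens):
//
//	upper := EmitterFunc(func(groups []string, _ Lexer) Iterator {
//		return Literator(Token{Keyword, strings.ToUpper(groups[0])})
//	})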
type EmitterFunc func(groups []string, lexer Lexer) Iterator

// Emit tokens for groups.
func (e EmitterFunc) Emit(groups []string, lexer Lexer) Iterator { return e(groups, lexer) }

// ByGroups emits a token for each matching group in the rule's regex.
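//
// For illustration, a hypothetical rule that splits "key = value" into three
// tokens (a sketch, not part of any real rule set):
//
//	{`(\w+)(\s*=\s*)(\w+)`, ByGroups(NameAttribute, Operator, String), nil}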
func ByGroups(emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		iterators := make([]Iterator, 0, len(groups)-1)
		// If the number of emitters does not match the number of matched
		// groups, emit an error token rather than panicking.
		if len(emitters) != len(groups)-1 {
			iterators = append(iterators, Error.Emit(groups, lexer))
		} else {
			for i, group := range groups[1:] {
				iterators = append(iterators, emitters[i].Emit([]string{group}, lexer))
			}
		}
		return Concaterator(iterators...)
	})
}

// UsingByGroup emits tokens for the matched groups in the regex using a
// "sublexer". Used when lexing code blocks where the name of a sublexer is
// contained within the block, for example on a Markdown text block or SQL
// language block.
//
// The sublexer will be retrieved using sublexerGetFunc (typically
// internal.Get), using the captured value from the matched sublexerNameGroup.
//
// If sublexerGetFunc returns a non-nil lexer for the captured
// sublexerNameGroup, then tokens for the matched codeGroup will be emitted
// using the retrieved lexer. Otherwise, if the sublexer is nil, tokens will
// be emitted from the passed emitter.
//
// Example:
//
//	var Markdown = internal.Register(MustNewLexer(
//		&Config{
//			Name:      "markdown",
//			Aliases:   []string{"md", "mkd"},
//			Filenames: []string{"*.md", "*.mkd", "*.markdown"},
//			MimeTypes: []string{"text/x-markdown"},
//		},
//		Rules{
//			"root": {
//				{"^(```)(\\w+)(\\n)([\\w\\W]*?)(^```$)",
//					UsingByGroup(
//						internal.Get,
//						2, 4,
//						String, String, String, Text, String,
//					),
//					nil,
//				},
//			},
//		},
//	))
//
// See lexers/m/markdown.go for the complete example.
//
// Note: panics if the number of emitters does not equal the number of matched
// groups in the regex.
func UsingByGroup(sublexerGetFunc func(string) Lexer, sublexerNameGroup, codeGroup int, emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		// bounds check
		if len(emitters) != len(groups)-1 {
			panic("UsingByGroup expects number of emitters to be the same as len(groups)-1")
		}

		// grab sublexer
		sublexer := sublexerGetFunc(groups[sublexerNameGroup])

		// build iterators
		iterators := make([]Iterator, len(groups)-1)
		for i, group := range groups[1:] {
			if i == codeGroup-1 && sublexer != nil {
				var err error
				iterators[i], err = sublexer.Tokenise(nil, groups[codeGroup])
				if err != nil {
					panic(err)
				}
			} else {
				iterators[i] = emitters[i].Emit([]string{group}, lexer)
			}
		}

		return Concaterator(iterators...)
	})
}

// Using returns an Emitter that uses a given Lexer for parsing and emitting.
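//
// For example, a rule could hand the matched text to another lexer (a sketch;
// "phpLexer" is a hypothetical Lexer obtained elsewhere):
//
//	{`<\?php.*?\?>`, Using(phpLexer), nil}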
func Using(lexer Lexer) Emitter {
	return EmitterFunc(func(groups []string, _ Lexer) Iterator {
		it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}

// UsingSelf is like Using, but uses the current Lexer.
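//
// For example, a string-interpolation rule might recursively lex delimited
// text with the lexer's own "root" state (a sketch):
//
//	{"`[^`]*`", UsingSelf("root"), nil}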
func UsingSelf(state string) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		it, err := lexer.Tokenise(&TokeniseOptions{State: state, Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}

// Words creates a regex that matches any of the given literal words.
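//
// For example:
//
//	Words(`\b`, `\b`, "if", "else", "for") // yields `\b(if|else|for)\b`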
func Words(prefix, suffix string, words ...string) string {
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}

// Tokenise text using lexer, returning tokens as a slice.
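//
// Typical use (a sketch, assuming "lexer" was obtained elsewhere):
//
//	tokens, err := Tokenise(lexer, nil, "1 + 2\n")
//	if err != nil {
//		// handle the error
//	}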
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename returns a clone of the Rules with state "old" renamed to "new".
func (r Rules) Rename(old, new string) Rules {
	r = r.Clone()
	r[new] = r[old]
	delete(r, old)
	return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
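//
// Note that states in "rules" replace whole states in the clone; rule lists
// are not concatenated. For example (a sketch):
//
//	merged := base.Merge(Rules{"string": {{`"`, String, Pop(1)}}})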
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}

// MustNewLexer creates a new Lexer or panics.
func MustNewLexer(config *Config, rules Rules) *RegexLexer {
	lexer, err := NewLexer(config, rules)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are
// sets of rules that match input, optionally modify lexer state, and output
// tokens.
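//
// A minimal example (a sketch):
//
//	lexer, err := NewLexer(
//		&Config{Name: "INI"},
//		Rules{
//			"root": {
//				{`;.*`, Comment, nil},
//				{`\[[^\]]+\]`, Keyword, nil},
//				{`(\w+)(\s*=\s*)(.*)`, ByGroups(NameAttribute, Operator, String), nil},
//				{`\s+`, Text, nil},
//			},
//		},
//	)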
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	if _, ok := rules["root"]; !ok {
		return nil, fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !config.NotMultiline {
				flags += "m"
			}
			if config.CaseInsensitive {
				flags += "i"
			}
			if config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}
	return &RegexLexer{
		config: config,
		rules:  compiledRules,
	}, nil
}

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules maps each state name to the sequence of compiled rules in
// that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer *RegexLexer
	Text  []rune
	Pos   int
	Rules CompiledRules
	Stack []string
	State string
	Rule  int
	// Group matches.
	Groups []string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}

// Iterator returns the next Token from the lexer.
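//
// Driving the iterator by hand looks like this (a sketch):
//
//	it, _ := lexer.Tokenise(nil, source)
//	for t := it(); t != EOF; t = it() {
//		fmt.Println(t)
//	}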
func (l *LexerState) Iterator() Token { // nolint: gocognit
	for l.Pos < len(l.Text) && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l.Lexer))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu       sync.Mutex
	compiled bool
	rules    map[string][]*CompiledRule
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
	r.analyser = analyser
	return r
}

func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}

func (r *RegexLexer) Config() *Config { // nolint
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid
// significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
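				// \G anchors the match at the current search position;
				// regexp2 supports it, unlike the standard library regexp.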
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, 0)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
	if err := r.maybeCompile(); err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
	}
	state := &LexerState{
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}

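// matchRules returns the index, rule, and captured groups of the first rule
// in "rules" whose regex matches at exactly "pos"; groups is nil if no rule
// matches.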
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			for _, g := range match.Groups() {
				groups = append(groups, g.String())
			}
			return i, rule, groups
		}
	}
	return 0, &CompiledRule{}, nil
}

// ensureLF replaces \r\n and lone \r with \n in a single pass. Equivalent to
// successive strings.ReplaceAll calls, but more efficient.
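//
// For example:
//
//	ensureLF("a\r\nb\rc") // returns "a\nb\nc"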
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}