package chroma

import (
	"fmt"
	"os"
	"regexp"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// An Emitter takes group matches and returns tokens.
type Emitter interface {
	// Emit tokens for the given regex groups.
	Emit(groups []string, lexer Lexer) Iterator
}

// EmitterFunc is a function that is an Emitter.
type EmitterFunc func(groups []string, lexer Lexer) Iterator

// Emit tokens for groups.
func (e EmitterFunc) Emit(groups []string, lexer Lexer) Iterator { return e(groups, lexer) }

// ByGroups emits a token for each matching group in the rule's regex.
func ByGroups(emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		iterators := make([]Iterator, 0, len(groups)-1)
		if len(emitters) != len(groups)-1 {
			iterators = append(iterators, Error.Emit(groups, lexer))
			// panic(errors.Errorf("number of groups %q does not match number of emitters %v", groups, emitters))
		} else {
			for i, group := range groups[1:] {
				iterators = append(iterators, emitters[i].Emit([]string{group}, lexer))
			}
		}
		return Concaterator(iterators...)
	})
}

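// A minimal usage sketch (illustrative, not part of this file): a rule whose
// pattern captures three groups, with one emitter per group. The token
// constants are assumed to be defined elsewhere in this package:
//
//	{`(\w+)(\s*=\s*)("[^"]*")`, ByGroups(Name, Operator, LiteralString), nil},
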
// UsingByGroup emits tokens for the matched groups in the regex using a
// "sublexer". Used when lexing code blocks where the name of a sublexer is
// contained within the block, for example on a Markdown text block or SQL
// language block.
//
// The sublexer will be retrieved using sublexerGetFunc (typically
// internal.Get), using the captured value from the matched sublexerNameGroup.
//
// If sublexerGetFunc returns a non-nil lexer for the captured sublexerNameGroup,
// then tokens for the matched codeGroup will be emitted using the retrieved
// lexer. Otherwise, if the sublexer is nil, then tokens will be emitted from
// the passed emitter.
//
// Example:
//
//	var Markdown = internal.Register(MustNewLexer(
//		&Config{
//			Name:      "markdown",
//			Aliases:   []string{"md", "mkd"},
//			Filenames: []string{"*.md", "*.mkd", "*.markdown"},
//			MimeTypes: []string{"text/x-markdown"},
//		},
//		Rules{
//			"root": {
//				{"^(```)(\\w+)(\\n)([\\w\\W]*?)(^```$)",
//					UsingByGroup(
//						internal.Get,
//						2, 4,
//						String, String, String, Text, String,
//					),
//					nil,
//				},
//			},
//		},
//	))
//
// See lexers/m/markdown.go for the complete example.
//
// Note: panics if the number of emitters does not equal the number of matched
// groups in the regex.
func UsingByGroup(sublexerGetFunc func(string) Lexer, sublexerNameGroup, codeGroup int, emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		// bounds check
		if len(emitters) != len(groups)-1 {
			panic("UsingByGroup expects number of emitters to be the same as len(groups)-1")
		}
		// grab sublexer
		sublexer := sublexerGetFunc(groups[sublexerNameGroup])
		// build iterators
		iterators := make([]Iterator, len(groups)-1)
		for i, group := range groups[1:] {
			if i == codeGroup-1 && sublexer != nil {
				var err error
				iterators[i], err = sublexer.Tokenise(nil, groups[codeGroup])
				if err != nil {
					panic(err)
				}
			} else {
				iterators[i] = emitters[i].Emit([]string{group}, lexer)
			}
		}
		return Concaterator(iterators...)
	})
}

// Using returns an Emitter that uses a given Lexer for parsing and emitting.
func Using(lexer Lexer) Emitter {
	return EmitterFunc(func(groups []string, _ Lexer) Iterator {
		it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}

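// A usage sketch (illustrative): delegate everything a pattern matches to a
// different lexer, e.g. one retrieved by name via internal.Get as mentioned
// in the UsingByGroup documentation above:
//
//	{`<script>[\w\W]*?</script>`, Using(internal.Get("JavaScript")), nil},
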
// UsingSelf is like Using, but uses the current Lexer.
func UsingSelf(state string) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		it, err := lexer.Tokenise(&TokeniseOptions{State: state, Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}

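// A usage sketch (illustrative): re-enter this lexer's own "root" state to
// tokenise interpolated code inside a matched span:
//
//	{`\$\{[^}]*\}`, UsingSelf("root"), nil},
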
// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}

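// For example, Words(`\b`, `\b`, "if", "else", "for") produces the pattern
// `\b(if|else|for)\b`, with each word passed through regexp.QuoteMeta first.
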
// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}

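// A usage sketch (illustrative; assumes a lexer value obtained elsewhere,
// e.g. from the lexers sub-package). Passing nil options uses the defaults:
//
//	tokens, err := Tokenise(lexer, nil, "hello world")
//	if err != nil {
//		// handle the error
//	}
//	for _, token := range tokens {
//		fmt.Println(token.Type, token.Value)
//	}
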
// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename clones the Rules, then renames the state "old" to "new" in the clone.
func (r Rules) Rename(old, new string) Rules {
	r = r.Clone()
	r[new] = r[old]
	delete(r, old)
	return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}

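// A usage sketch (illustrative): note that Merge replaces whole states rather
// than appending rules within a state:
//
//	base := Rules{
//		"root":   {{`\w+`, Name, nil}},
//		"string": {{`"`, LiteralString, Pop(1)}},
//	}
//	derived := base.Merge(Rules{
//		"string": {{`'`, LiteralString, Pop(1)}},
//	})
//	// derived keeps base's "root", but its "string" state is replaced wholesale.
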
// MustNewLexer creates a new Lexer or panics.
func MustNewLexer(config *Config, rules Rules) *RegexLexer {
	lexer, err := NewLexer(config, rules)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	if _, ok := rules["root"]; !ok {
		return nil, fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !config.NotMultiline {
				flags += "m"
			}
			if config.CaseInsensitive {
				flags += "i"
			}
			if config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}
	return &RegexLexer{
		config: config,
		rules:  compiledRules,
	}, nil
}

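// A minimal construction sketch (illustrative): a lexer with a single "root"
// state that tokenises words and whitespace, using token constants defined
// elsewhere in this package:
//
//	lexer := MustNewLexer(&Config{Name: "example"}, Rules{
//		"root": {
//			{`\s+`, Whitespace, nil},
//			{`\w+`, Name, nil},
//		},
//	})
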
// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules is a map of state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer *RegexLexer
	Text  []rune
	Pos   int
	Rules CompiledRules
	Stack []string
	State string
	Rule  int
	// Group matches.
	Groups []string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}

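// A usage sketch (illustrative): a custom Mutator can stash arbitrary values
// on the state between matches, e.g. to track nesting depth:
//
//	depth, _ := l.Get("depth").(int)
//	l.Set("depth", depth+1)
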
// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	for l.Pos < len(l.Text) && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}
		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l.Lexer))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}
	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu       sync.Mutex
	compiled bool
	rules    map[string][]*CompiledRule
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
	r.analyser = analyser
	return r
}

func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}

func (r *RegexLexer) Config() *Config { // nolint
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, 0)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

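// For illustration: a rule pattern such as `\w+`, compiled with the default
// multiline flag, becomes the regexp2 expression `\G(?m)(?:\w+)`. The `\G`
// anchor pins each match to the current lexer position.
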
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
	if err := r.maybeCompile(); err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
	}
	state := &LexerState{
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}

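// A usage sketch (illustrative): tokenise from the "root" state and drain the
// returned Iterator until EOF:
//
//	it, err := lexer.Tokenise(&TokeniseOptions{State: "root"}, source)
//	if err != nil {
//		// handle the error
//	}
//	for t := it(); t != EOF; t = it() {
//		fmt.Println(t)
//	}
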
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			for _, g := range match.Groups() {
				groups = append(groups, g.String())
			}
			return i, rule, groups
		}
	}
	return 0, &CompiledRule{}, nil
}

// ensureLF normalises line endings by replacing \r\n and bare \r with \n.
// Equivalent to successive strings.ReplaceAll calls, but done in a single pass.
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}

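// For example, ensureLF("a\r\nb\rc") returns "a\nb\nc".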