You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lexer.go 14KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752
  1. // TOML lexer.
  2. //
  3. // Written using the principles developed by Rob Pike in
  4. // http://www.youtube.com/watch?v=HxaD_trXwRE
  5. package toml
  6. import (
  7. "bytes"
  8. "errors"
  9. "fmt"
  10. "regexp"
  11. "strconv"
  12. "strings"
  13. )
// dateRegexp matches an RFC 3339-style datetime at the start of the
// lookahead buffer; it is compiled once in init().
var dateRegexp *regexp.Regexp
// Define state functions

// A tomlLexStateFn performs one lexing step and returns the next state
// function, or nil when lexing is finished (EOF or error token emitted).
type tomlLexStateFn func() tomlLexStateFn
// Define lexer
type tomlLexer struct {
	inputIdx          int    // index of the next rune to read
	input             []rune // Textual source
	currentTokenStart int    // offset (in runes) of the first rune of the token being built
	currentTokenStop  int    // offset one past the last rune of the token being built
	tokens            []token
	depth             int // '['/']' nesting depth; >0 while inside an array value
	line              int // line of the current token's start (1-based)
	col               int // column of the current token's start (1-based)
	endbufferLine     int // line of the read cursor
	endbufferCol      int // column of the read cursor
}
  30. // Basic read operations on input
  31. func (l *tomlLexer) read() rune {
  32. r := l.peek()
  33. if r == '\n' {
  34. l.endbufferLine++
  35. l.endbufferCol = 1
  36. } else {
  37. l.endbufferCol++
  38. }
  39. l.inputIdx++
  40. return r
  41. }
// next consumes one rune and extends the current token to cover it.
// The eof sentinel is never made part of a token.
func (l *tomlLexer) next() rune {
	r := l.read()
	if r != eof {
		l.currentTokenStop++
	}
	return r
}
// ignore discards the runes accumulated so far and re-anchors the token
// start (and its reported position) at the read cursor.
func (l *tomlLexer) ignore() {
	l.currentTokenStart = l.currentTokenStop
	l.line = l.endbufferLine
	l.col = l.endbufferCol
}
// skip consumes a single rune and discards it.
func (l *tomlLexer) skip() {
	l.next()
	l.ignore()
}
  58. func (l *tomlLexer) fastForward(n int) {
  59. for i := 0; i < n; i++ {
  60. l.next()
  61. }
  62. }
// emitWithValue appends a token of type t carrying an explicit value,
// positioned at the start of the current token, then resets the token
// window via ignore.
func (l *tomlLexer) emitWithValue(t tokenType, value string) {
	l.tokens = append(l.tokens, token{
		Position: Position{l.line, l.col},
		typ:      t,
		val:      value,
	})
	l.ignore()
}
// emit appends a token of type t whose value is the raw input text
// accumulated since the last emit/ignore.
func (l *tomlLexer) emit(t tokenType) {
	l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop]))
}
  74. func (l *tomlLexer) peek() rune {
  75. if l.inputIdx >= len(l.input) {
  76. return eof
  77. }
  78. return l.input[l.inputIdx]
  79. }
  80. func (l *tomlLexer) peekString(size int) string {
  81. maxIdx := len(l.input)
  82. upperIdx := l.inputIdx + size // FIXME: potential overflow
  83. if upperIdx > maxIdx {
  84. upperIdx = maxIdx
  85. }
  86. return string(l.input[l.inputIdx:upperIdx])
  87. }
// follow reports whether the upcoming input starts with next.
func (l *tomlLexer) follow(next string) bool {
	return next == l.peekString(len(next))
}
// Error management

// errorf records a tokenError carrying the formatted message at the
// current token position and returns nil to stop the state machine.
func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
	l.tokens = append(l.tokens, token{
		Position: Position{l.line, l.col},
		typ:      tokenError,
		val:      fmt.Sprintf(format, args...),
	})
	return nil
}
// State functions

// lexVoid scans top-level context: table headers, keys, comments and
// blank lines, dispatching on the first significant rune.
func (l *tomlLexer) lexVoid() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '[':
			return l.lexTableKey
		case '#':
			return l.lexComment(l.lexVoid)
		case '=':
			return l.lexEqual
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			continue
		}
		if isSpace(next) {
			l.skip()
		}
		// Still inside an unterminated array value spanning lines:
		// keep lexing rvalues rather than keys.
		if l.depth > 0 {
			return l.lexRvalue
		}
		if isKeyStartChar(next) {
			return l.lexKey
		}
		if next == eof {
			l.next()
			break
		}
	}
	l.emit(tokenEOF)
	return nil
}
// lexRvalue scans the value side of an assignment: scalars, arrays,
// inline tables, strings, datetimes and trailing comments.
func (l *tomlLexer) lexRvalue() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '.':
			return l.errorf("cannot start float with a dot")
		case '=':
			return l.lexEqual
		case '[':
			// Entering an array: depth keeps newlines in rvalue mode.
			l.depth++
			return l.lexLeftBracket
		case ']':
			l.depth--
			return l.lexRightBracket
		case '{':
			return l.lexLeftCurlyBrace
		case '}':
			return l.lexRightCurlyBrace
		case '#':
			return l.lexComment(l.lexRvalue)
		case '"':
			return l.lexString
		case '\'':
			return l.lexLiteralString
		case ',':
			return l.lexComma
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			// A newline ends the value only outside of an array.
			if l.depth == 0 {
				return l.lexVoid
			}
			return l.lexRvalue
		case '_':
			return l.errorf("cannot start number with underscore")
		}
		if l.follow("true") {
			return l.lexTrue
		}
		if l.follow("false") {
			return l.lexFalse
		}
		if l.follow("inf") {
			return l.lexInf
		}
		if l.follow("nan") {
			return l.lexNan
		}
		if isSpace(next) {
			l.skip()
			continue
		}
		if next == eof {
			l.next()
			break
		}
		// Datetimes are tried before numbers; 35 runes of lookahead is
		// enough for the longest datetime dateRegexp can match.
		possibleDate := l.peekString(35)
		dateMatch := dateRegexp.FindString(possibleDate)
		if dateMatch != "" {
			l.fastForward(len(dateMatch))
			return l.lexDate
		}
		if next == '+' || next == '-' || isDigit(next) {
			return l.lexNumber
		}
		if isAlphanumeric(next) {
			return l.lexKey
		}
		return l.errorf("no value can start with %c", next)
	}
	l.emit(tokenEOF)
	return nil
}
// lexLeftCurlyBrace emits the '{' opening an inline table.
func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftCurlyBrace)
	return l.lexRvalue
}
// lexRightCurlyBrace emits the '}' closing an inline table.
func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenRightCurlyBrace)
	return l.lexRvalue
}
// lexDate emits a datetime token; the matched runes were already
// consumed by lexRvalue via fastForward.
func (l *tomlLexer) lexDate() tomlLexStateFn {
	l.emit(tokenDate)
	return l.lexRvalue
}
// lexTrue consumes the 4-rune literal "true" and emits it.
func (l *tomlLexer) lexTrue() tomlLexStateFn {
	l.fastForward(4)
	l.emit(tokenTrue)
	return l.lexRvalue
}
// lexFalse consumes the 5-rune literal "false" and emits it.
func (l *tomlLexer) lexFalse() tomlLexStateFn {
	l.fastForward(5)
	l.emit(tokenFalse)
	return l.lexRvalue
}
// lexInf consumes the 3-rune literal "inf" and emits it; any leading
// sign was already consumed by lexNumber.
func (l *tomlLexer) lexInf() tomlLexStateFn {
	l.fastForward(3)
	l.emit(tokenInf)
	return l.lexRvalue
}
// lexNan consumes the 3-rune literal "nan" and emits it; any leading
// sign was already consumed by lexNumber.
func (l *tomlLexer) lexNan() tomlLexStateFn {
	l.fastForward(3)
	l.emit(tokenNan)
	return l.lexRvalue
}
// lexEqual emits the '=' between a key and its value.
func (l *tomlLexer) lexEqual() tomlLexStateFn {
	l.next()
	l.emit(tokenEqual)
	return l.lexRvalue
}
// lexComma emits the ',' separating array or inline-table elements.
func (l *tomlLexer) lexComma() tomlLexStateFn {
	l.next()
	l.emit(tokenComma)
	return l.lexRvalue
}
  252. // Parse the key and emits its value without escape sequences.
  253. // bare keys, basic string keys and literal string keys are supported.
  254. func (l *tomlLexer) lexKey() tomlLexStateFn {
  255. growingString := ""
  256. for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() {
  257. if r == '"' {
  258. l.next()
  259. str, err := l.lexStringAsString(`"`, false, true)
  260. if err != nil {
  261. return l.errorf(err.Error())
  262. }
  263. growingString += "\"" + str + "\""
  264. l.next()
  265. continue
  266. } else if r == '\'' {
  267. l.next()
  268. str, err := l.lexLiteralStringAsString(`'`, false)
  269. if err != nil {
  270. return l.errorf(err.Error())
  271. }
  272. growingString += "'" + str + "'"
  273. l.next()
  274. continue
  275. } else if r == '\n' {
  276. return l.errorf("keys cannot contain new lines")
  277. } else if isSpace(r) {
  278. break
  279. } else if r == '.' {
  280. // skip
  281. } else if !isValidBareChar(r) {
  282. return l.errorf("keys cannot contain %c character", r)
  283. }
  284. growingString += string(r)
  285. l.next()
  286. }
  287. l.emitWithValue(tokenKey, growingString)
  288. return l.lexVoid
  289. }
// lexComment returns a state that consumes a '#' comment up to (not
// including) the line break or eof, discards it, and resumes
// previousState.
func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn {
	return func() tomlLexStateFn {
		for next := l.peek(); next != '\n' && next != eof; next = l.peek() {
			// Stop before a CRLF pair so the newline handling in the
			// resumed state sees it intact.
			if next == '\r' && l.follow("\r\n") {
				break
			}
			l.next()
		}
		l.ignore()
		return previousState
	}
}
// lexLeftBracket emits the '[' opening an array value.
func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftBracket)
	return l.lexRvalue
}
  307. func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) {
  308. growingString := ""
  309. if discardLeadingNewLine {
  310. if l.follow("\r\n") {
  311. l.skip()
  312. l.skip()
  313. } else if l.peek() == '\n' {
  314. l.skip()
  315. }
  316. }
  317. // find end of string
  318. for {
  319. if l.follow(terminator) {
  320. return growingString, nil
  321. }
  322. next := l.peek()
  323. if next == eof {
  324. break
  325. }
  326. growingString += string(l.next())
  327. }
  328. return "", errors.New("unclosed string")
  329. }
  330. func (l *tomlLexer) lexLiteralString() tomlLexStateFn {
  331. l.skip()
  332. // handle special case for triple-quote
  333. terminator := "'"
  334. discardLeadingNewLine := false
  335. if l.follow("''") {
  336. l.skip()
  337. l.skip()
  338. terminator = "'''"
  339. discardLeadingNewLine = true
  340. }
  341. str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine)
  342. if err != nil {
  343. return l.errorf(err.Error())
  344. }
  345. l.emitWithValue(tokenString, str)
  346. l.fastForward(len(terminator))
  347. l.ignore()
  348. return l.lexRvalue
  349. }
  350. // Lex a string and return the results as a string.
  351. // Terminator is the substring indicating the end of the token.
  352. // The resulting string does not include the terminator.
  353. func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) {
  354. growingString := ""
  355. if discardLeadingNewLine {
  356. if l.follow("\r\n") {
  357. l.skip()
  358. l.skip()
  359. } else if l.peek() == '\n' {
  360. l.skip()
  361. }
  362. }
  363. for {
  364. if l.follow(terminator) {
  365. return growingString, nil
  366. }
  367. if l.follow("\\") {
  368. l.next()
  369. switch l.peek() {
  370. case '\r':
  371. fallthrough
  372. case '\n':
  373. fallthrough
  374. case '\t':
  375. fallthrough
  376. case ' ':
  377. // skip all whitespace chars following backslash
  378. for strings.ContainsRune("\r\n\t ", l.peek()) {
  379. l.next()
  380. }
  381. case '"':
  382. growingString += "\""
  383. l.next()
  384. case 'n':
  385. growingString += "\n"
  386. l.next()
  387. case 'b':
  388. growingString += "\b"
  389. l.next()
  390. case 'f':
  391. growingString += "\f"
  392. l.next()
  393. case '/':
  394. growingString += "/"
  395. l.next()
  396. case 't':
  397. growingString += "\t"
  398. l.next()
  399. case 'r':
  400. growingString += "\r"
  401. l.next()
  402. case '\\':
  403. growingString += "\\"
  404. l.next()
  405. case 'u':
  406. l.next()
  407. code := ""
  408. for i := 0; i < 4; i++ {
  409. c := l.peek()
  410. if !isHexDigit(c) {
  411. return "", errors.New("unfinished unicode escape")
  412. }
  413. l.next()
  414. code = code + string(c)
  415. }
  416. intcode, err := strconv.ParseInt(code, 16, 32)
  417. if err != nil {
  418. return "", errors.New("invalid unicode escape: \\u" + code)
  419. }
  420. growingString += string(rune(intcode))
  421. case 'U':
  422. l.next()
  423. code := ""
  424. for i := 0; i < 8; i++ {
  425. c := l.peek()
  426. if !isHexDigit(c) {
  427. return "", errors.New("unfinished unicode escape")
  428. }
  429. l.next()
  430. code = code + string(c)
  431. }
  432. intcode, err := strconv.ParseInt(code, 16, 64)
  433. if err != nil {
  434. return "", errors.New("invalid unicode escape: \\U" + code)
  435. }
  436. growingString += string(rune(intcode))
  437. default:
  438. return "", errors.New("invalid escape sequence: \\" + string(l.peek()))
  439. }
  440. } else {
  441. r := l.peek()
  442. if 0x00 <= r && r <= 0x1F && !(acceptNewLines && (r == '\n' || r == '\r')) {
  443. return "", fmt.Errorf("unescaped control character %U", r)
  444. }
  445. l.next()
  446. growingString += string(r)
  447. }
  448. if l.peek() == eof {
  449. break
  450. }
  451. }
  452. return "", errors.New("unclosed string")
  453. }
  454. func (l *tomlLexer) lexString() tomlLexStateFn {
  455. l.skip()
  456. // handle special case for triple-quote
  457. terminator := `"`
  458. discardLeadingNewLine := false
  459. acceptNewLines := false
  460. if l.follow(`""`) {
  461. l.skip()
  462. l.skip()
  463. terminator = `"""`
  464. discardLeadingNewLine = true
  465. acceptNewLines = true
  466. }
  467. str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines)
  468. if err != nil {
  469. return l.errorf(err.Error())
  470. }
  471. l.emitWithValue(tokenString, str)
  472. l.fastForward(len(terminator))
  473. l.ignore()
  474. return l.lexRvalue
  475. }
// lexTableKey is entered on '[': it distinguishes a plain table header
// from an array-of-tables header ("[[") and emits the opening token.
func (l *tomlLexer) lexTableKey() tomlLexStateFn {
	l.next()
	if l.peek() == '[' {
		// token '[[' signifies an array of tables
		l.next()
		l.emit(tokenDoubleLeftBracket)
		return l.lexInsideTableArrayKey
	}
	// vanilla table key
	l.emit(tokenLeftBracket)
	return l.lexInsideTableKey
}
  488. // Parse the key till "]]", but only bare keys are supported
  489. func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn {
  490. for r := l.peek(); r != eof; r = l.peek() {
  491. switch r {
  492. case ']':
  493. if l.currentTokenStop > l.currentTokenStart {
  494. l.emit(tokenKeyGroupArray)
  495. }
  496. l.next()
  497. if l.peek() != ']' {
  498. break
  499. }
  500. l.next()
  501. l.emit(tokenDoubleRightBracket)
  502. return l.lexVoid
  503. case '[':
  504. return l.errorf("table array key cannot contain ']'")
  505. default:
  506. l.next()
  507. }
  508. }
  509. return l.errorf("unclosed table array key")
  510. }
  511. // Parse the key till "]" but only bare keys are supported
  512. func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn {
  513. for r := l.peek(); r != eof; r = l.peek() {
  514. switch r {
  515. case ']':
  516. if l.currentTokenStop > l.currentTokenStart {
  517. l.emit(tokenKeyGroup)
  518. }
  519. l.next()
  520. l.emit(tokenRightBracket)
  521. return l.lexVoid
  522. case '[':
  523. return l.errorf("table key cannot contain ']'")
  524. default:
  525. l.next()
  526. }
  527. }
  528. return l.errorf("unclosed table key")
  529. }
// lexRightBracket emits the ']' closing an array value.
func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenRightBracket)
	return l.lexRvalue
}
// validRuneFn reports whether a rune may appear among the digits of a
// number literal in a particular base.
type validRuneFn func(r rune) bool
  536. func isValidHexRune(r rune) bool {
  537. return r >= 'a' && r <= 'f' ||
  538. r >= 'A' && r <= 'F' ||
  539. r >= '0' && r <= '9' ||
  540. r == '_'
  541. }
  542. func isValidOctalRune(r rune) bool {
  543. return r >= '0' && r <= '7' || r == '_'
  544. }
  545. func isValidBinaryRune(r rune) bool {
  546. return r == '0' || r == '1' || r == '_'
  547. }
// lexNumber scans an integer or float: 0x/0o/0b prefixed integers,
// optional sign, '_' digit separators, decimal point and exponent.
// A sign followed by inf/nan is delegated to those states.
func (l *tomlLexer) lexNumber() tomlLexStateFn {
	r := l.peek()
	if r == '0' {
		follow := l.peekString(2)
		if len(follow) == 2 {
			var isValidRune validRuneFn
			switch follow[1] {
			case 'x':
				isValidRune = isValidHexRune
			case 'o':
				isValidRune = isValidOctalRune
			case 'b':
				isValidRune = isValidBinaryRune
			default:
				// Any other letter after '0' is an unknown base prefix;
				// non-letters fall through to the decimal scanner below.
				if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' {
					return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1]))
				}
			}
			if isValidRune != nil {
				// Consume the "0x"/"0o"/"0b" prefix, then the digits.
				l.next()
				l.next()
				digitSeen := false
				for {
					next := l.peek()
					if !isValidRune(next) {
						break
					}
					digitSeen = true
					l.next()
				}
				if !digitSeen {
					return l.errorf("number needs at least one digit")
				}
				l.emit(tokenInteger)
				return l.lexRvalue
			}
		}
	}
	if r == '+' || r == '-' {
		l.next()
		// "+inf", "-nan", etc.: hand off to the literal scanners.
		if l.follow("inf") {
			return l.lexInf
		}
		if l.follow("nan") {
			return l.lexNan
		}
	}
	pointSeen := false
	expSeen := false
	digitSeen := false
	for {
		next := l.peek()
		if next == '.' {
			if pointSeen {
				return l.errorf("cannot have two dots in one float")
			}
			l.next()
			if !isDigit(l.peek()) {
				return l.errorf("float cannot end with a dot")
			}
			pointSeen = true
		} else if next == 'e' || next == 'E' {
			expSeen = true
			l.next()
			r := l.peek()
			if r == '+' || r == '-' {
				l.next()
			}
		} else if isDigit(next) {
			digitSeen = true
			l.next()
		} else if next == '_' {
			l.next()
		} else {
			break
		}
		// A '.' before any digit means the literal started with a dot.
		if pointSeen && !digitSeen {
			return l.errorf("cannot start float with a dot")
		}
	}
	if !digitSeen {
		return l.errorf("no digit in that number")
	}
	// A decimal point or exponent makes it a float; otherwise integer.
	if pointSeen || expSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}
	return l.lexRvalue
}
// run drives the state machine: each state returns the next until one
// returns nil (after emitting tokenEOF or tokenError).
func (l *tomlLexer) run() {
	for state := l.lexVoid; state != nil; {
		state = state()
	}
}
func init() {
	// RFC 3339-style datetime, anchored at the start of the lookahead:
	// date, 'T', time, optional fractional seconds, then 'Z' or offset.
	dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`)
}
// Entry point

// lexToml tokenizes a whole TOML document and returns the token stream,
// which ends with either a tokenEOF or a tokenError.
func lexToml(inputBytes []byte) []token {
	runes := bytes.Runes(inputBytes)
	l := &tomlLexer{
		input:         runes,
		tokens:        make([]token, 0, 256),
		line:          1,
		col:           1,
		endbufferLine: 1,
		endbufferCol:  1,
	}
	l.run()
	return l.tokens
}