You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lex.go 22KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953
  1. package toml
  2. import (
  3. "fmt"
  4. "strings"
  5. "unicode"
  6. "unicode/utf8"
  7. )
// itemType identifies the kind of an item produced by the lexer.
type itemType int

// The item kinds. Their numeric values come from iota, so the order of this
// list determines the value of every constant in it.
const (
	itemError itemType = iota
	itemNIL            // used in the parser to indicate no type
	itemEOF
	itemText
	itemString
	itemRawString
	itemMultilineString
	itemRawMultilineString
	itemBool
	itemInteger
	itemFloat
	itemDatetime
	itemArray // the start of an array
	itemArrayEnd
	itemTableStart
	itemTableEnd
	itemArrayTableStart
	itemArrayTableEnd
	itemKeyStart
	itemCommentStart
	itemInlineTableStart
	itemInlineTableEnd
)
// Runes with a special meaning to the lexer. Several names share one
// character ('[' opens a table, an array of tables and an array alike); the
// separate names record which construct a state function is looking for.
// eof is the value next returns once the input is exhausted.
const (
	eof              = 0
	comma            = ','
	tableStart       = '['
	tableEnd         = ']'
	arrayTableStart  = '['
	arrayTableEnd    = ']'
	tableSep         = '.'
	keySep           = '='
	arrayStart       = '['
	arrayEnd         = ']'
	commentStart     = '#'
	stringStart      = '"'
	stringEnd        = '"'
	rawStringStart   = '\''
	rawStringEnd     = '\''
	inlineTableStart = '{'
	inlineTableEnd   = '}'
)
// stateFn is one state of the lexer's state machine: it is called with the
// lexer and returns the state to run next. nextItem keeps running the
// returned states until an item is available on the items channel.
type stateFn func(lx *lexer) stateFn
// lexer holds the scanning state shared by all state functions.
type lexer struct {
	// input is the text being lexed.
	input string
	// start is the byte offset in input at which the pending item begins;
	// current returns input[start:pos].
	start int
	// pos is the byte offset in input of the next rune to be read.
	pos int
	// line is the current line number. lex starts it at 1; next and backup
	// adjust it when they step across a '\n'.
	line int
	// state is the state function that nextItem runs next.
	state stateFn
	// items is the buffered channel on which lexed items are queued.
	items chan item

	// Allow for backing up up to three runes.
	// This is necessary because TOML contains 3-rune tokens (""" and ''').
	prevWidths [3]int
	nprev      int // how many of prevWidths are in use

	// If we emit an eof, we can still back up, but it is not OK to call
	// next again.
	atEOF bool

	// A stack of state functions used to maintain context.
	// The idea is to reuse parts of the state machine in various places.
	// For example, values can appear at the top level or within arbitrarily
	// nested arrays. The last state on the stack is used after a value has
	// been lexed. Similarly for comments.
	stack []stateFn
}
// item is a single lexed token: its kind, the text it covers (or the message
// for an itemError), and the lexer's line counter when it was emitted.
type item struct {
	typ  itemType
	val  string
	line int
}
  79. func (lx *lexer) nextItem() item {
  80. for {
  81. select {
  82. case item := <-lx.items:
  83. return item
  84. default:
  85. lx.state = lx.state(lx)
  86. }
  87. }
  88. }
  89. func lex(input string) *lexer {
  90. lx := &lexer{
  91. input: input,
  92. state: lexTop,
  93. line: 1,
  94. items: make(chan item, 10),
  95. stack: make([]stateFn, 0, 10),
  96. }
  97. return lx
  98. }
  99. func (lx *lexer) push(state stateFn) {
  100. lx.stack = append(lx.stack, state)
  101. }
  102. func (lx *lexer) pop() stateFn {
  103. if len(lx.stack) == 0 {
  104. return lx.errorf("BUG in lexer: no states to pop")
  105. }
  106. last := lx.stack[len(lx.stack)-1]
  107. lx.stack = lx.stack[0 : len(lx.stack)-1]
  108. return last
  109. }
  110. func (lx *lexer) current() string {
  111. return lx.input[lx.start:lx.pos]
  112. }
  113. func (lx *lexer) emit(typ itemType) {
  114. lx.items <- item{typ, lx.current(), lx.line}
  115. lx.start = lx.pos
  116. }
  117. func (lx *lexer) emitTrim(typ itemType) {
  118. lx.items <- item{typ, strings.TrimSpace(lx.current()), lx.line}
  119. lx.start = lx.pos
  120. }
  121. func (lx *lexer) next() (r rune) {
  122. if lx.atEOF {
  123. panic("next called after EOF")
  124. }
  125. if lx.pos >= len(lx.input) {
  126. lx.atEOF = true
  127. return eof
  128. }
  129. if lx.input[lx.pos] == '\n' {
  130. lx.line++
  131. }
  132. lx.prevWidths[2] = lx.prevWidths[1]
  133. lx.prevWidths[1] = lx.prevWidths[0]
  134. if lx.nprev < 3 {
  135. lx.nprev++
  136. }
  137. r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
  138. lx.prevWidths[0] = w
  139. lx.pos += w
  140. return r
  141. }
  142. // ignore skips over the pending input before this point.
  143. func (lx *lexer) ignore() {
  144. lx.start = lx.pos
  145. }
  146. // backup steps back one rune. Can be called only twice between calls to next.
  147. func (lx *lexer) backup() {
  148. if lx.atEOF {
  149. lx.atEOF = false
  150. return
  151. }
  152. if lx.nprev < 1 {
  153. panic("backed up too far")
  154. }
  155. w := lx.prevWidths[0]
  156. lx.prevWidths[0] = lx.prevWidths[1]
  157. lx.prevWidths[1] = lx.prevWidths[2]
  158. lx.nprev--
  159. lx.pos -= w
  160. if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
  161. lx.line--
  162. }
  163. }
  164. // accept consumes the next rune if it's equal to `valid`.
  165. func (lx *lexer) accept(valid rune) bool {
  166. if lx.next() == valid {
  167. return true
  168. }
  169. lx.backup()
  170. return false
  171. }
  172. // peek returns but does not consume the next rune in the input.
  173. func (lx *lexer) peek() rune {
  174. r := lx.next()
  175. lx.backup()
  176. return r
  177. }
  178. // skip ignores all input that matches the given predicate.
  179. func (lx *lexer) skip(pred func(rune) bool) {
  180. for {
  181. r := lx.next()
  182. if pred(r) {
  183. continue
  184. }
  185. lx.backup()
  186. lx.ignore()
  187. return
  188. }
  189. }
  190. // errorf stops all lexing by emitting an error and returning `nil`.
  191. // Note that any value that is a character is escaped if it's a special
  192. // character (newlines, tabs, etc.).
  193. func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
  194. lx.items <- item{
  195. itemError,
  196. fmt.Sprintf(format, values...),
  197. lx.line,
  198. }
  199. return nil
  200. }
  201. // lexTop consumes elements at the top level of TOML data.
  202. func lexTop(lx *lexer) stateFn {
  203. r := lx.next()
  204. if isWhitespace(r) || isNL(r) {
  205. return lexSkip(lx, lexTop)
  206. }
  207. switch r {
  208. case commentStart:
  209. lx.push(lexTop)
  210. return lexCommentStart
  211. case tableStart:
  212. return lexTableStart
  213. case eof:
  214. if lx.pos > lx.start {
  215. return lx.errorf("unexpected EOF")
  216. }
  217. lx.emit(itemEOF)
  218. return nil
  219. }
  220. // At this point, the only valid item can be a key, so we back up
  221. // and let the key lexer do the rest.
  222. lx.backup()
  223. lx.push(lexTopEnd)
  224. return lexKeyStart
  225. }
  226. // lexTopEnd is entered whenever a top-level item has been consumed. (A value
  227. // or a table.) It must see only whitespace, and will turn back to lexTop
  228. // upon a newline. If it sees EOF, it will quit the lexer successfully.
  229. func lexTopEnd(lx *lexer) stateFn {
  230. r := lx.next()
  231. switch {
  232. case r == commentStart:
  233. // a comment will read to a newline for us.
  234. lx.push(lexTop)
  235. return lexCommentStart
  236. case isWhitespace(r):
  237. return lexTopEnd
  238. case isNL(r):
  239. lx.ignore()
  240. return lexTop
  241. case r == eof:
  242. lx.emit(itemEOF)
  243. return nil
  244. }
  245. return lx.errorf("expected a top-level item to end with a newline, "+
  246. "comment, or EOF, but got %q instead", r)
  247. }
  248. // lexTable lexes the beginning of a table. Namely, it makes sure that
  249. // it starts with a character other than '.' and ']'.
  250. // It assumes that '[' has already been consumed.
  251. // It also handles the case that this is an item in an array of tables.
  252. // e.g., '[[name]]'.
  253. func lexTableStart(lx *lexer) stateFn {
  254. if lx.peek() == arrayTableStart {
  255. lx.next()
  256. lx.emit(itemArrayTableStart)
  257. lx.push(lexArrayTableEnd)
  258. } else {
  259. lx.emit(itemTableStart)
  260. lx.push(lexTableEnd)
  261. }
  262. return lexTableNameStart
  263. }
  264. func lexTableEnd(lx *lexer) stateFn {
  265. lx.emit(itemTableEnd)
  266. return lexTopEnd
  267. }
  268. func lexArrayTableEnd(lx *lexer) stateFn {
  269. if r := lx.next(); r != arrayTableEnd {
  270. return lx.errorf("expected end of table array name delimiter %q, "+
  271. "but got %q instead", arrayTableEnd, r)
  272. }
  273. lx.emit(itemArrayTableEnd)
  274. return lexTopEnd
  275. }
  276. func lexTableNameStart(lx *lexer) stateFn {
  277. lx.skip(isWhitespace)
  278. switch r := lx.peek(); {
  279. case r == tableEnd || r == eof:
  280. return lx.errorf("unexpected end of table name " +
  281. "(table names cannot be empty)")
  282. case r == tableSep:
  283. return lx.errorf("unexpected table separator " +
  284. "(table names cannot be empty)")
  285. case r == stringStart || r == rawStringStart:
  286. lx.ignore()
  287. lx.push(lexTableNameEnd)
  288. return lexValue // reuse string lexing
  289. default:
  290. return lexBareTableName
  291. }
  292. }
  293. // lexBareTableName lexes the name of a table. It assumes that at least one
  294. // valid character for the table has already been read.
  295. func lexBareTableName(lx *lexer) stateFn {
  296. r := lx.next()
  297. if isBareKeyChar(r) {
  298. return lexBareTableName
  299. }
  300. lx.backup()
  301. lx.emit(itemText)
  302. return lexTableNameEnd
  303. }
  304. // lexTableNameEnd reads the end of a piece of a table name, optionally
  305. // consuming whitespace.
  306. func lexTableNameEnd(lx *lexer) stateFn {
  307. lx.skip(isWhitespace)
  308. switch r := lx.next(); {
  309. case isWhitespace(r):
  310. return lexTableNameEnd
  311. case r == tableSep:
  312. lx.ignore()
  313. return lexTableNameStart
  314. case r == tableEnd:
  315. return lx.pop()
  316. default:
  317. return lx.errorf("expected '.' or ']' to end table name, "+
  318. "but got %q instead", r)
  319. }
  320. }
  321. // lexKeyStart consumes a key name up until the first non-whitespace character.
  322. // lexKeyStart will ignore whitespace.
  323. func lexKeyStart(lx *lexer) stateFn {
  324. r := lx.peek()
  325. switch {
  326. case r == keySep:
  327. return lx.errorf("unexpected key separator %q", keySep)
  328. case isWhitespace(r) || isNL(r):
  329. lx.next()
  330. return lexSkip(lx, lexKeyStart)
  331. case r == stringStart || r == rawStringStart:
  332. lx.ignore()
  333. lx.emit(itemKeyStart)
  334. lx.push(lexKeyEnd)
  335. return lexValue // reuse string lexing
  336. default:
  337. lx.ignore()
  338. lx.emit(itemKeyStart)
  339. return lexBareKey
  340. }
  341. }
  342. // lexBareKey consumes the text of a bare key. Assumes that the first character
  343. // (which is not whitespace) has not yet been consumed.
  344. func lexBareKey(lx *lexer) stateFn {
  345. switch r := lx.next(); {
  346. case isBareKeyChar(r):
  347. return lexBareKey
  348. case isWhitespace(r):
  349. lx.backup()
  350. lx.emit(itemText)
  351. return lexKeyEnd
  352. case r == keySep:
  353. lx.backup()
  354. lx.emit(itemText)
  355. return lexKeyEnd
  356. default:
  357. return lx.errorf("bare keys cannot contain %q", r)
  358. }
  359. }
  360. // lexKeyEnd consumes the end of a key and trims whitespace (up to the key
  361. // separator).
  362. func lexKeyEnd(lx *lexer) stateFn {
  363. switch r := lx.next(); {
  364. case r == keySep:
  365. return lexSkip(lx, lexValue)
  366. case isWhitespace(r):
  367. return lexSkip(lx, lexKeyEnd)
  368. default:
  369. return lx.errorf("expected key separator %q, but got %q instead",
  370. keySep, r)
  371. }
  372. }
  373. // lexValue starts the consumption of a value anywhere a value is expected.
  374. // lexValue will ignore whitespace.
  375. // After a value is lexed, the last state on the next is popped and returned.
  376. func lexValue(lx *lexer) stateFn {
  377. // We allow whitespace to precede a value, but NOT newlines.
  378. // In array syntax, the array states are responsible for ignoring newlines.
  379. r := lx.next()
  380. switch {
  381. case isWhitespace(r):
  382. return lexSkip(lx, lexValue)
  383. case isDigit(r):
  384. lx.backup() // avoid an extra state and use the same as above
  385. return lexNumberOrDateStart
  386. }
  387. switch r {
  388. case arrayStart:
  389. lx.ignore()
  390. lx.emit(itemArray)
  391. return lexArrayValue
  392. case inlineTableStart:
  393. lx.ignore()
  394. lx.emit(itemInlineTableStart)
  395. return lexInlineTableValue
  396. case stringStart:
  397. if lx.accept(stringStart) {
  398. if lx.accept(stringStart) {
  399. lx.ignore() // Ignore """
  400. return lexMultilineString
  401. }
  402. lx.backup()
  403. }
  404. lx.ignore() // ignore the '"'
  405. return lexString
  406. case rawStringStart:
  407. if lx.accept(rawStringStart) {
  408. if lx.accept(rawStringStart) {
  409. lx.ignore() // Ignore """
  410. return lexMultilineRawString
  411. }
  412. lx.backup()
  413. }
  414. lx.ignore() // ignore the "'"
  415. return lexRawString
  416. case '+', '-':
  417. return lexNumberStart
  418. case '.': // special error case, be kind to users
  419. return lx.errorf("floats must start with a digit, not '.'")
  420. }
  421. if unicode.IsLetter(r) {
  422. // Be permissive here; lexBool will give a nice error if the
  423. // user wrote something like
  424. // x = foo
  425. // (i.e. not 'true' or 'false' but is something else word-like.)
  426. lx.backup()
  427. return lexBool
  428. }
  429. return lx.errorf("expected value but found %q instead", r)
  430. }
  431. // lexArrayValue consumes one value in an array. It assumes that '[' or ','
  432. // have already been consumed. All whitespace and newlines are ignored.
  433. func lexArrayValue(lx *lexer) stateFn {
  434. r := lx.next()
  435. switch {
  436. case isWhitespace(r) || isNL(r):
  437. return lexSkip(lx, lexArrayValue)
  438. case r == commentStart:
  439. lx.push(lexArrayValue)
  440. return lexCommentStart
  441. case r == comma:
  442. return lx.errorf("unexpected comma")
  443. case r == arrayEnd:
  444. // NOTE(caleb): The spec isn't clear about whether you can have
  445. // a trailing comma or not, so we'll allow it.
  446. return lexArrayEnd
  447. }
  448. lx.backup()
  449. lx.push(lexArrayValueEnd)
  450. return lexValue
  451. }
  452. // lexArrayValueEnd consumes everything between the end of an array value and
  453. // the next value (or the end of the array): it ignores whitespace and newlines
  454. // and expects either a ',' or a ']'.
  455. func lexArrayValueEnd(lx *lexer) stateFn {
  456. r := lx.next()
  457. switch {
  458. case isWhitespace(r) || isNL(r):
  459. return lexSkip(lx, lexArrayValueEnd)
  460. case r == commentStart:
  461. lx.push(lexArrayValueEnd)
  462. return lexCommentStart
  463. case r == comma:
  464. lx.ignore()
  465. return lexArrayValue // move on to the next value
  466. case r == arrayEnd:
  467. return lexArrayEnd
  468. }
  469. return lx.errorf(
  470. "expected a comma or array terminator %q, but got %q instead",
  471. arrayEnd, r,
  472. )
  473. }
  474. // lexArrayEnd finishes the lexing of an array.
  475. // It assumes that a ']' has just been consumed.
  476. func lexArrayEnd(lx *lexer) stateFn {
  477. lx.ignore()
  478. lx.emit(itemArrayEnd)
  479. return lx.pop()
  480. }
  481. // lexInlineTableValue consumes one key/value pair in an inline table.
  482. // It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
  483. func lexInlineTableValue(lx *lexer) stateFn {
  484. r := lx.next()
  485. switch {
  486. case isWhitespace(r):
  487. return lexSkip(lx, lexInlineTableValue)
  488. case isNL(r):
  489. return lx.errorf("newlines not allowed within inline tables")
  490. case r == commentStart:
  491. lx.push(lexInlineTableValue)
  492. return lexCommentStart
  493. case r == comma:
  494. return lx.errorf("unexpected comma")
  495. case r == inlineTableEnd:
  496. return lexInlineTableEnd
  497. }
  498. lx.backup()
  499. lx.push(lexInlineTableValueEnd)
  500. return lexKeyStart
  501. }
  502. // lexInlineTableValueEnd consumes everything between the end of an inline table
  503. // key/value pair and the next pair (or the end of the table):
  504. // it ignores whitespace and expects either a ',' or a '}'.
  505. func lexInlineTableValueEnd(lx *lexer) stateFn {
  506. r := lx.next()
  507. switch {
  508. case isWhitespace(r):
  509. return lexSkip(lx, lexInlineTableValueEnd)
  510. case isNL(r):
  511. return lx.errorf("newlines not allowed within inline tables")
  512. case r == commentStart:
  513. lx.push(lexInlineTableValueEnd)
  514. return lexCommentStart
  515. case r == comma:
  516. lx.ignore()
  517. return lexInlineTableValue
  518. case r == inlineTableEnd:
  519. return lexInlineTableEnd
  520. }
  521. return lx.errorf("expected a comma or an inline table terminator %q, "+
  522. "but got %q instead", inlineTableEnd, r)
  523. }
  524. // lexInlineTableEnd finishes the lexing of an inline table.
  525. // It assumes that a '}' has just been consumed.
  526. func lexInlineTableEnd(lx *lexer) stateFn {
  527. lx.ignore()
  528. lx.emit(itemInlineTableEnd)
  529. return lx.pop()
  530. }
  531. // lexString consumes the inner contents of a string. It assumes that the
  532. // beginning '"' has already been consumed and ignored.
  533. func lexString(lx *lexer) stateFn {
  534. r := lx.next()
  535. switch {
  536. case r == eof:
  537. return lx.errorf("unexpected EOF")
  538. case isNL(r):
  539. return lx.errorf("strings cannot contain newlines")
  540. case r == '\\':
  541. lx.push(lexString)
  542. return lexStringEscape
  543. case r == stringEnd:
  544. lx.backup()
  545. lx.emit(itemString)
  546. lx.next()
  547. lx.ignore()
  548. return lx.pop()
  549. }
  550. return lexString
  551. }
  552. // lexMultilineString consumes the inner contents of a string. It assumes that
  553. // the beginning '"""' has already been consumed and ignored.
  554. func lexMultilineString(lx *lexer) stateFn {
  555. switch lx.next() {
  556. case eof:
  557. return lx.errorf("unexpected EOF")
  558. case '\\':
  559. return lexMultilineStringEscape
  560. case stringEnd:
  561. if lx.accept(stringEnd) {
  562. if lx.accept(stringEnd) {
  563. lx.backup()
  564. lx.backup()
  565. lx.backup()
  566. lx.emit(itemMultilineString)
  567. lx.next()
  568. lx.next()
  569. lx.next()
  570. lx.ignore()
  571. return lx.pop()
  572. }
  573. lx.backup()
  574. }
  575. }
  576. return lexMultilineString
  577. }
  578. // lexRawString consumes a raw string. Nothing can be escaped in such a string.
  579. // It assumes that the beginning "'" has already been consumed and ignored.
  580. func lexRawString(lx *lexer) stateFn {
  581. r := lx.next()
  582. switch {
  583. case r == eof:
  584. return lx.errorf("unexpected EOF")
  585. case isNL(r):
  586. return lx.errorf("strings cannot contain newlines")
  587. case r == rawStringEnd:
  588. lx.backup()
  589. lx.emit(itemRawString)
  590. lx.next()
  591. lx.ignore()
  592. return lx.pop()
  593. }
  594. return lexRawString
  595. }
  596. // lexMultilineRawString consumes a raw string. Nothing can be escaped in such
  597. // a string. It assumes that the beginning "'''" has already been consumed and
  598. // ignored.
  599. func lexMultilineRawString(lx *lexer) stateFn {
  600. switch lx.next() {
  601. case eof:
  602. return lx.errorf("unexpected EOF")
  603. case rawStringEnd:
  604. if lx.accept(rawStringEnd) {
  605. if lx.accept(rawStringEnd) {
  606. lx.backup()
  607. lx.backup()
  608. lx.backup()
  609. lx.emit(itemRawMultilineString)
  610. lx.next()
  611. lx.next()
  612. lx.next()
  613. lx.ignore()
  614. return lx.pop()
  615. }
  616. lx.backup()
  617. }
  618. }
  619. return lexMultilineRawString
  620. }
  621. // lexMultilineStringEscape consumes an escaped character. It assumes that the
  622. // preceding '\\' has already been consumed.
  623. func lexMultilineStringEscape(lx *lexer) stateFn {
  624. // Handle the special case first:
  625. if isNL(lx.next()) {
  626. return lexMultilineString
  627. }
  628. lx.backup()
  629. lx.push(lexMultilineString)
  630. return lexStringEscape(lx)
  631. }
  632. func lexStringEscape(lx *lexer) stateFn {
  633. r := lx.next()
  634. switch r {
  635. case 'b':
  636. fallthrough
  637. case 't':
  638. fallthrough
  639. case 'n':
  640. fallthrough
  641. case 'f':
  642. fallthrough
  643. case 'r':
  644. fallthrough
  645. case '"':
  646. fallthrough
  647. case '\\':
  648. return lx.pop()
  649. case 'u':
  650. return lexShortUnicodeEscape
  651. case 'U':
  652. return lexLongUnicodeEscape
  653. }
  654. return lx.errorf("invalid escape character %q; only the following "+
  655. "escape characters are allowed: "+
  656. `\b, \t, \n, \f, \r, \", \\, \uXXXX, and \UXXXXXXXX`, r)
  657. }
  658. func lexShortUnicodeEscape(lx *lexer) stateFn {
  659. var r rune
  660. for i := 0; i < 4; i++ {
  661. r = lx.next()
  662. if !isHexadecimal(r) {
  663. return lx.errorf(`expected four hexadecimal digits after '\u', `+
  664. "but got %q instead", lx.current())
  665. }
  666. }
  667. return lx.pop()
  668. }
  669. func lexLongUnicodeEscape(lx *lexer) stateFn {
  670. var r rune
  671. for i := 0; i < 8; i++ {
  672. r = lx.next()
  673. if !isHexadecimal(r) {
  674. return lx.errorf(`expected eight hexadecimal digits after '\U', `+
  675. "but got %q instead", lx.current())
  676. }
  677. }
  678. return lx.pop()
  679. }
  680. // lexNumberOrDateStart consumes either an integer, a float, or datetime.
  681. func lexNumberOrDateStart(lx *lexer) stateFn {
  682. r := lx.next()
  683. if isDigit(r) {
  684. return lexNumberOrDate
  685. }
  686. switch r {
  687. case '_':
  688. return lexNumber
  689. case 'e', 'E':
  690. return lexFloat
  691. case '.':
  692. return lx.errorf("floats must start with a digit, not '.'")
  693. }
  694. return lx.errorf("expected a digit but got %q", r)
  695. }
  696. // lexNumberOrDate consumes either an integer, float or datetime.
  697. func lexNumberOrDate(lx *lexer) stateFn {
  698. r := lx.next()
  699. if isDigit(r) {
  700. return lexNumberOrDate
  701. }
  702. switch r {
  703. case '-':
  704. return lexDatetime
  705. case '_':
  706. return lexNumber
  707. case '.', 'e', 'E':
  708. return lexFloat
  709. }
  710. lx.backup()
  711. lx.emit(itemInteger)
  712. return lx.pop()
  713. }
  714. // lexDatetime consumes a Datetime, to a first approximation.
  715. // The parser validates that it matches one of the accepted formats.
  716. func lexDatetime(lx *lexer) stateFn {
  717. r := lx.next()
  718. if isDigit(r) {
  719. return lexDatetime
  720. }
  721. switch r {
  722. case '-', 'T', ':', '.', 'Z', '+':
  723. return lexDatetime
  724. }
  725. lx.backup()
  726. lx.emit(itemDatetime)
  727. return lx.pop()
  728. }
  729. // lexNumberStart consumes either an integer or a float. It assumes that a sign
  730. // has already been read, but that *no* digits have been consumed.
  731. // lexNumberStart will move to the appropriate integer or float states.
  732. func lexNumberStart(lx *lexer) stateFn {
  733. // We MUST see a digit. Even floats have to start with a digit.
  734. r := lx.next()
  735. if !isDigit(r) {
  736. if r == '.' {
  737. return lx.errorf("floats must start with a digit, not '.'")
  738. }
  739. return lx.errorf("expected a digit but got %q", r)
  740. }
  741. return lexNumber
  742. }
  743. // lexNumber consumes an integer or a float after seeing the first digit.
  744. func lexNumber(lx *lexer) stateFn {
  745. r := lx.next()
  746. if isDigit(r) {
  747. return lexNumber
  748. }
  749. switch r {
  750. case '_':
  751. return lexNumber
  752. case '.', 'e', 'E':
  753. return lexFloat
  754. }
  755. lx.backup()
  756. lx.emit(itemInteger)
  757. return lx.pop()
  758. }
  759. // lexFloat consumes the elements of a float. It allows any sequence of
  760. // float-like characters, so floats emitted by the lexer are only a first
  761. // approximation and must be validated by the parser.
  762. func lexFloat(lx *lexer) stateFn {
  763. r := lx.next()
  764. if isDigit(r) {
  765. return lexFloat
  766. }
  767. switch r {
  768. case '_', '.', '-', '+', 'e', 'E':
  769. return lexFloat
  770. }
  771. lx.backup()
  772. lx.emit(itemFloat)
  773. return lx.pop()
  774. }
  775. // lexBool consumes a bool string: 'true' or 'false.
  776. func lexBool(lx *lexer) stateFn {
  777. var rs []rune
  778. for {
  779. r := lx.next()
  780. if !unicode.IsLetter(r) {
  781. lx.backup()
  782. break
  783. }
  784. rs = append(rs, r)
  785. }
  786. s := string(rs)
  787. switch s {
  788. case "true", "false":
  789. lx.emit(itemBool)
  790. return lx.pop()
  791. }
  792. return lx.errorf("expected value but found %q instead", s)
  793. }
  794. // lexCommentStart begins the lexing of a comment. It will emit
  795. // itemCommentStart and consume no characters, passing control to lexComment.
  796. func lexCommentStart(lx *lexer) stateFn {
  797. lx.ignore()
  798. lx.emit(itemCommentStart)
  799. return lexComment
  800. }
  801. // lexComment lexes an entire comment. It assumes that '#' has been consumed.
  802. // It will consume *up to* the first newline character, and pass control
  803. // back to the last state on the stack.
  804. func lexComment(lx *lexer) stateFn {
  805. r := lx.peek()
  806. if isNL(r) || r == eof {
  807. lx.emit(itemText)
  808. return lx.pop()
  809. }
  810. lx.next()
  811. return lexComment
  812. }
  813. // lexSkip ignores all slurped input and moves on to the next state.
  814. func lexSkip(lx *lexer, nextState stateFn) stateFn {
  815. return func(lx *lexer) stateFn {
  816. lx.ignore()
  817. return nextState
  818. }
  819. }
  820. // isWhitespace returns true if `r` is a whitespace character according
  821. // to the spec.
  822. func isWhitespace(r rune) bool {
  823. return r == '\t' || r == ' '
  824. }
  825. func isNL(r rune) bool {
  826. return r == '\n' || r == '\r'
  827. }
  828. func isDigit(r rune) bool {
  829. return r >= '0' && r <= '9'
  830. }
  831. func isHexadecimal(r rune) bool {
  832. return (r >= '0' && r <= '9') ||
  833. (r >= 'a' && r <= 'f') ||
  834. (r >= 'A' && r <= 'F')
  835. }
  836. func isBareKeyChar(r rune) bool {
  837. return (r >= 'A' && r <= 'Z') ||
  838. (r >= 'a' && r <= 'z') ||
  839. (r >= '0' && r <= '9') ||
  840. r == '_' ||
  841. r == '-'
  842. }
  843. func (itype itemType) String() string {
  844. switch itype {
  845. case itemError:
  846. return "Error"
  847. case itemNIL:
  848. return "NIL"
  849. case itemEOF:
  850. return "EOF"
  851. case itemText:
  852. return "Text"
  853. case itemString, itemRawString, itemMultilineString, itemRawMultilineString:
  854. return "String"
  855. case itemBool:
  856. return "Bool"
  857. case itemInteger:
  858. return "Integer"
  859. case itemFloat:
  860. return "Float"
  861. case itemDatetime:
  862. return "DateTime"
  863. case itemTableStart:
  864. return "TableStart"
  865. case itemTableEnd:
  866. return "TableEnd"
  867. case itemKeyStart:
  868. return "KeyStart"
  869. case itemArray:
  870. return "Array"
  871. case itemArrayEnd:
  872. return "ArrayEnd"
  873. case itemCommentStart:
  874. return "CommentStart"
  875. }
  876. panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
  877. }
  878. func (item item) String() string {
  879. return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val)
  880. }