You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

scanner.go 7.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. // Copyright 2009 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package scanner implements a scanner for gcfg configuration text.
  5. // It takes a []byte as source which can then be tokenized
  6. // through repeated calls to the Scan method.
  7. //
  8. // Note that the API for the scanner package may change to accommodate new
  9. // features or implementation changes in gcfg.
  10. //
  11. package scanner
  12. import (
  13. "fmt"
  14. "path/filepath"
  15. "unicode"
  16. "unicode/utf8"
  17. )
  18. import (
  19. "github.com/src-d/gcfg/token"
  20. )
  21. // An ErrorHandler may be provided to Scanner.Init. If a syntax error is
  22. // encountered and a handler was installed, the handler is called with a
  23. // position and an error message. The position points to the beginning of
  24. // the offending token.
  25. //
  26. type ErrorHandler func(pos token.Position, msg string)
  27. // A Scanner holds the scanner's internal state while processing
  28. // a given text. It can be allocated as part of another data
  29. // structure but must be initialized via Init before use.
  30. //
  31. type Scanner struct {
  32. // immutable state
  33. file *token.File // source file handle
  34. dir string // directory portion of file.Name()
  35. src []byte // source
  36. err ErrorHandler // error reporting; or nil
  37. mode Mode // scanning mode
  38. // scanning state
  39. ch rune // current character
  40. offset int // character offset
  41. rdOffset int // reading offset (position after current character)
  42. lineOffset int // current line offset
  43. nextVal bool // next token is expected to be a value
  44. // public state - ok to modify
  45. ErrorCount int // number of errors encountered
  46. }
  47. // Read the next Unicode char into s.ch.
  48. // s.ch < 0 means end-of-file.
  49. //
  50. func (s *Scanner) next() {
  51. if s.rdOffset < len(s.src) {
  52. s.offset = s.rdOffset
  53. if s.ch == '\n' {
  54. s.lineOffset = s.offset
  55. s.file.AddLine(s.offset)
  56. }
  57. r, w := rune(s.src[s.rdOffset]), 1
  58. switch {
  59. case r == 0:
  60. s.error(s.offset, "illegal character NUL")
  61. case r >= 0x80:
  62. // not ASCII
  63. r, w = utf8.DecodeRune(s.src[s.rdOffset:])
  64. if r == utf8.RuneError && w == 1 {
  65. s.error(s.offset, "illegal UTF-8 encoding")
  66. }
  67. }
  68. s.rdOffset += w
  69. s.ch = r
  70. } else {
  71. s.offset = len(s.src)
  72. if s.ch == '\n' {
  73. s.lineOffset = s.offset
  74. s.file.AddLine(s.offset)
  75. }
  76. s.ch = -1 // eof
  77. }
  78. }
  79. // A mode value is a set of flags (or 0).
  80. // They control scanner behavior.
  81. //
  82. type Mode uint
  83. const (
  84. ScanComments Mode = 1 << iota // return comments as COMMENT tokens
  85. )
  86. // Init prepares the scanner s to tokenize the text src by setting the
  87. // scanner at the beginning of src. The scanner uses the file set file
  88. // for position information and it adds line information for each line.
  89. // It is ok to re-use the same file when re-scanning the same file as
  90. // line information which is already present is ignored. Init causes a
  91. // panic if the file size does not match the src size.
  92. //
  93. // Calls to Scan will invoke the error handler err if they encounter a
  94. // syntax error and err is not nil. Also, for each error encountered,
  95. // the Scanner field ErrorCount is incremented by one. The mode parameter
  96. // determines how comments are handled.
  97. //
  98. // Note that Init may call err if there is an error in the first character
  99. // of the file.
  100. //
  101. func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
  102. // Explicitly initialize all fields since a scanner may be reused.
  103. if file.Size() != len(src) {
  104. panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
  105. }
  106. s.file = file
  107. s.dir, _ = filepath.Split(file.Name())
  108. s.src = src
  109. s.err = err
  110. s.mode = mode
  111. s.ch = ' '
  112. s.offset = 0
  113. s.rdOffset = 0
  114. s.lineOffset = 0
  115. s.ErrorCount = 0
  116. s.nextVal = false
  117. s.next()
  118. }
  119. func (s *Scanner) error(offs int, msg string) {
  120. if s.err != nil {
  121. s.err(s.file.Position(s.file.Pos(offs)), msg)
  122. }
  123. s.ErrorCount++
  124. }
  125. func (s *Scanner) scanComment() string {
  126. // initial [;#] already consumed
  127. offs := s.offset - 1 // position of initial [;#]
  128. for s.ch != '\n' && s.ch >= 0 {
  129. s.next()
  130. }
  131. return string(s.src[offs:s.offset])
  132. }
  133. func isLetter(ch rune) bool {
  134. return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch >= 0x80 && unicode.IsLetter(ch)
  135. }
  136. func isDigit(ch rune) bool {
  137. return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
  138. }
  139. func (s *Scanner) scanIdentifier() string {
  140. offs := s.offset
  141. for isLetter(s.ch) || isDigit(s.ch) || s.ch == '-' {
  142. s.next()
  143. }
  144. return string(s.src[offs:s.offset])
  145. }
  146. func (s *Scanner) scanEscape(val bool) {
  147. offs := s.offset
  148. ch := s.ch
  149. s.next() // always make progress
  150. switch ch {
  151. case '\\', '"':
  152. // ok
  153. case 'n', 't', 'b':
  154. if val {
  155. break // ok
  156. }
  157. fallthrough
  158. default:
  159. s.error(offs, "unknown escape sequence")
  160. }
  161. }
  162. func (s *Scanner) scanString() string {
  163. // '"' opening already consumed
  164. offs := s.offset - 1
  165. for s.ch != '"' {
  166. ch := s.ch
  167. s.next()
  168. if ch == '\n' || ch < 0 {
  169. s.error(offs, "string not terminated")
  170. break
  171. }
  172. if ch == '\\' {
  173. s.scanEscape(false)
  174. }
  175. }
  176. s.next()
  177. return string(s.src[offs:s.offset])
  178. }
  179. func stripCR(b []byte) []byte {
  180. c := make([]byte, len(b))
  181. i := 0
  182. for _, ch := range b {
  183. if ch != '\r' {
  184. c[i] = ch
  185. i++
  186. }
  187. }
  188. return c[:i]
  189. }
  190. func (s *Scanner) scanValString() string {
  191. offs := s.offset
  192. hasCR := false
  193. end := offs
  194. inQuote := false
  195. loop:
  196. for inQuote || s.ch >= 0 && s.ch != '\n' && s.ch != ';' && s.ch != '#' {
  197. ch := s.ch
  198. s.next()
  199. switch {
  200. case inQuote && ch == '\\':
  201. s.scanEscape(true)
  202. case !inQuote && ch == '\\':
  203. if s.ch == '\r' {
  204. hasCR = true
  205. s.next()
  206. }
  207. if s.ch != '\n' {
  208. s.scanEscape(true)
  209. } else {
  210. s.next()
  211. }
  212. case ch == '"':
  213. inQuote = !inQuote
  214. case ch == '\r':
  215. hasCR = true
  216. case ch < 0 || inQuote && ch == '\n':
  217. s.error(offs, "string not terminated")
  218. break loop
  219. }
  220. if inQuote || !isWhiteSpace(ch) {
  221. end = s.offset
  222. }
  223. }
  224. lit := s.src[offs:end]
  225. if hasCR {
  226. lit = stripCR(lit)
  227. }
  228. return string(lit)
  229. }
  230. func isWhiteSpace(ch rune) bool {
  231. return ch == ' ' || ch == '\t' || ch == '\r'
  232. }
  233. func (s *Scanner) skipWhitespace() {
  234. for isWhiteSpace(s.ch) {
  235. s.next()
  236. }
  237. }
  238. // Scan scans the next token and returns the token position, the token,
  239. // and its literal string if applicable. The source end is indicated by
  240. // token.EOF.
  241. //
  242. // If the returned token is a literal (token.IDENT, token.STRING) or
  243. // token.COMMENT, the literal string has the corresponding value.
  244. //
  245. // If the returned token is token.ILLEGAL, the literal string is the
  246. // offending character.
  247. //
  248. // In all other cases, Scan returns an empty literal string.
  249. //
  250. // For more tolerant parsing, Scan will return a valid token if
  251. // possible even if a syntax error was encountered. Thus, even
  252. // if the resulting token sequence contains no illegal tokens,
  253. // a client may not assume that no error occurred. Instead it
  254. // must check the scanner's ErrorCount or the number of calls
  255. // of the error handler, if there was one installed.
  256. //
  257. // Scan adds line information to the file added to the file
  258. // set with Init. Token positions are relative to that file
  259. // and thus relative to the file set.
  260. //
  261. func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
  262. scanAgain:
  263. s.skipWhitespace()
  264. // current token start
  265. pos = s.file.Pos(s.offset)
  266. // determine token value
  267. switch ch := s.ch; {
  268. case s.nextVal:
  269. lit = s.scanValString()
  270. tok = token.STRING
  271. s.nextVal = false
  272. case isLetter(ch):
  273. lit = s.scanIdentifier()
  274. tok = token.IDENT
  275. default:
  276. s.next() // always make progress
  277. switch ch {
  278. case -1:
  279. tok = token.EOF
  280. case '\n':
  281. tok = token.EOL
  282. case '"':
  283. tok = token.STRING
  284. lit = s.scanString()
  285. case '[':
  286. tok = token.LBRACK
  287. case ']':
  288. tok = token.RBRACK
  289. case ';', '#':
  290. // comment
  291. lit = s.scanComment()
  292. if s.mode&ScanComments == 0 {
  293. // skip comment
  294. goto scanAgain
  295. }
  296. tok = token.COMMENT
  297. case '=':
  298. tok = token.ASSIGN
  299. s.nextVal = true
  300. default:
  301. s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
  302. tok = token.ILLEGAL
  303. lit = string(ch)
  304. }
  305. }
  306. return
  307. }