You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

parser.go 17KB


  1. // Package cascadia is an implementation of CSS selectors.
  2. package cascadia
  3. import (
  4. "errors"
  5. "fmt"
  6. "regexp"
  7. "strconv"
  8. "strings"
  9. )
  // parser is a cursor over the text of a CSS selector. The parse methods
// read from s and record their progress in i.
type parser struct {
	// s is the complete selector source being parsed.
	s string
	// i is the byte offset in s of the next unread character.
	i int
}
  15. // parseEscape parses a backslash escape.
  16. func (p *parser) parseEscape() (result string, err error) {
  17. if len(p.s) < p.i+2 || p.s[p.i] != '\\' {
  18. return "", errors.New("invalid escape sequence")
  19. }
  20. start := p.i + 1
  21. c := p.s[start]
  22. switch {
  23. case c == '\r' || c == '\n' || c == '\f':
  24. return "", errors.New("escaped line ending outside string")
  25. case hexDigit(c):
  26. // unicode escape (hex)
  27. var i int
  28. for i = start; i < p.i+6 && i < len(p.s) && hexDigit(p.s[i]); i++ {
  29. // empty
  30. }
  31. v, _ := strconv.ParseUint(p.s[start:i], 16, 21)
  32. if len(p.s) > i {
  33. switch p.s[i] {
  34. case '\r':
  35. i++
  36. if len(p.s) > i && p.s[i] == '\n' {
  37. i++
  38. }
  39. case ' ', '\t', '\n', '\f':
  40. i++
  41. }
  42. }
  43. p.i = i
  44. return string(rune(v)), nil
  45. }
  46. // Return the literal character after the backslash.
  47. result = p.s[start : start+1]
  48. p.i += 2
  49. return result, nil
  50. }
  // toLowerASCII returns s with the ASCII letters 'A'-'Z' replaced by their
// lowercase forms. All other bytes, including non-ASCII ones, are kept as
// they are. When s contains no ASCII capital, s itself is returned.
func toLowerASCII(s string) string {
	for first := 0; first < len(s); first++ {
		if s[first] < 'A' || s[first] > 'Z' {
			continue
		}
		// First capital found: copy once, then fix up the remainder.
		buf := make([]byte, len(s))
		copy(buf, s)
		for j := first; j < len(buf); j++ {
			if ch := buf[j]; ch >= 'A' && ch <= 'Z' {
				buf[j] = ch + ('a' - 'A')
			}
		}
		return string(buf)
	}
	return s
}
  // hexDigit reports whether c is an ASCII hexadecimal digit
// (0-9, a-f or A-F).
func hexDigit(c byte) bool {
	switch {
	case c >= '0' && c <= '9',
		c >= 'a' && c <= 'f',
		c >= 'A' && c <= 'F':
		return true
	}
	return false
}
  // nameStart reports whether c may begin an identifier: an ASCII letter, an
// underscore, or any byte above 127. A leading hyphen and escape sequences
// are handled by the caller and are not considered here.
func nameStart(c byte) bool {
	switch {
	case c == '_', c > 127:
		return true
	case c >= 'a' && c <= 'z':
		return true
	case c >= 'A' && c <= 'Z':
		return true
	}
	return false
}
  // nameChar reports whether c may appear inside an identifier: an ASCII
// letter, digit, underscore or hyphen, or any byte above 127. Escape
// sequences are handled by the caller and are not considered here.
func nameChar(c byte) bool {
	switch {
	case c == '_', c == '-', c > 127:
		return true
	case c >= '0' && c <= '9':
		return true
	case c >= 'a' && c <= 'z':
		return true
	case c >= 'A' && c <= 'Z':
		return true
	}
	return false
}
  82. // parseIdentifier parses an identifier.
  83. func (p *parser) parseIdentifier() (result string, err error) {
  84. startingDash := false
  85. if len(p.s) > p.i && p.s[p.i] == '-' {
  86. startingDash = true
  87. p.i++
  88. }
  89. if len(p.s) <= p.i {
  90. return "", errors.New("expected identifier, found EOF instead")
  91. }
  92. if c := p.s[p.i]; !(nameStart(c) || c == '\\') {
  93. return "", fmt.Errorf("expected identifier, found %c instead", c)
  94. }
  95. result, err = p.parseName()
  96. if startingDash && err == nil {
  97. result = "-" + result
  98. }
  99. return
  100. }
  101. // parseName parses a name (which is like an identifier, but doesn't have
  102. // extra restrictions on the first character).
  103. func (p *parser) parseName() (result string, err error) {
  104. i := p.i
  105. loop:
  106. for i < len(p.s) {
  107. c := p.s[i]
  108. switch {
  109. case nameChar(c):
  110. start := i
  111. for i < len(p.s) && nameChar(p.s[i]) {
  112. i++
  113. }
  114. result += p.s[start:i]
  115. case c == '\\':
  116. p.i = i
  117. val, err := p.parseEscape()
  118. if err != nil {
  119. return "", err
  120. }
  121. i = p.i
  122. result += val
  123. default:
  124. break loop
  125. }
  126. }
  127. if result == "" {
  128. return "", errors.New("expected name, found EOF instead")
  129. }
  130. p.i = i
  131. return result, nil
  132. }
  133. // parseString parses a single- or double-quoted string.
  134. func (p *parser) parseString() (result string, err error) {
  135. i := p.i
  136. if len(p.s) < i+2 {
  137. return "", errors.New("expected string, found EOF instead")
  138. }
  139. quote := p.s[i]
  140. i++
  141. loop:
  142. for i < len(p.s) {
  143. switch p.s[i] {
  144. case '\\':
  145. if len(p.s) > i+1 {
  146. switch c := p.s[i+1]; c {
  147. case '\r':
  148. if len(p.s) > i+2 && p.s[i+2] == '\n' {
  149. i += 3
  150. continue loop
  151. }
  152. fallthrough
  153. case '\n', '\f':
  154. i += 2
  155. continue loop
  156. }
  157. }
  158. p.i = i
  159. val, err := p.parseEscape()
  160. if err != nil {
  161. return "", err
  162. }
  163. i = p.i
  164. result += val
  165. case quote:
  166. break loop
  167. case '\r', '\n', '\f':
  168. return "", errors.New("unexpected end of line in string")
  169. default:
  170. start := i
  171. for i < len(p.s) {
  172. if c := p.s[i]; c == quote || c == '\\' || c == '\r' || c == '\n' || c == '\f' {
  173. break
  174. }
  175. i++
  176. }
  177. result += p.s[start:i]
  178. }
  179. }
  180. if i >= len(p.s) {
  181. return "", errors.New("EOF in string")
  182. }
  183. // Consume the final quote.
  184. i++
  185. p.i = i
  186. return result, nil
  187. }
  188. // parseRegex parses a regular expression; the end is defined by encountering an
  189. // unmatched closing ')' or ']' which is not consumed
  190. func (p *parser) parseRegex() (rx *regexp.Regexp, err error) {
  191. i := p.i
  192. if len(p.s) < i+2 {
  193. return nil, errors.New("expected regular expression, found EOF instead")
  194. }
  195. // number of open parens or brackets;
  196. // when it becomes negative, finished parsing regex
  197. open := 0
  198. loop:
  199. for i < len(p.s) {
  200. switch p.s[i] {
  201. case '(', '[':
  202. open++
  203. case ')', ']':
  204. open--
  205. if open < 0 {
  206. break loop
  207. }
  208. }
  209. i++
  210. }
  211. if i >= len(p.s) {
  212. return nil, errors.New("EOF in regular expression")
  213. }
  214. rx, err = regexp.Compile(p.s[p.i:i])
  215. p.i = i
  216. return rx, err
  217. }
  218. // skipWhitespace consumes whitespace characters and comments.
  219. // It returns true if there was actually anything to skip.
  220. func (p *parser) skipWhitespace() bool {
  221. i := p.i
  222. for i < len(p.s) {
  223. switch p.s[i] {
  224. case ' ', '\t', '\r', '\n', '\f':
  225. i++
  226. continue
  227. case '/':
  228. if strings.HasPrefix(p.s[i:], "/*") {
  229. end := strings.Index(p.s[i+len("/*"):], "*/")
  230. if end != -1 {
  231. i += end + len("/**/")
  232. continue
  233. }
  234. }
  235. }
  236. break
  237. }
  238. if i > p.i {
  239. p.i = i
  240. return true
  241. }
  242. return false
  243. }
  244. // consumeParenthesis consumes an opening parenthesis and any following
  245. // whitespace. It returns true if there was actually a parenthesis to skip.
  246. func (p *parser) consumeParenthesis() bool {
  247. if p.i < len(p.s) && p.s[p.i] == '(' {
  248. p.i++
  249. p.skipWhitespace()
  250. return true
  251. }
  252. return false
  253. }
  254. // consumeClosingParenthesis consumes a closing parenthesis and any preceding
  255. // whitespace. It returns true if there was actually a parenthesis to skip.
  256. func (p *parser) consumeClosingParenthesis() bool {
  257. i := p.i
  258. p.skipWhitespace()
  259. if p.i < len(p.s) && p.s[p.i] == ')' {
  260. p.i++
  261. return true
  262. }
  263. p.i = i
  264. return false
  265. }
  266. // parseTypeSelector parses a type selector (one that matches by tag name).
  267. func (p *parser) parseTypeSelector() (result tagSelector, err error) {
  268. tag, err := p.parseIdentifier()
  269. if err != nil {
  270. return
  271. }
  272. return tagSelector{tag: toLowerASCII(tag)}, nil
  273. }
  274. // parseIDSelector parses a selector that matches by id attribute.
  275. func (p *parser) parseIDSelector() (idSelector, error) {
  276. if p.i >= len(p.s) {
  277. return idSelector{}, fmt.Errorf("expected id selector (#id), found EOF instead")
  278. }
  279. if p.s[p.i] != '#' {
  280. return idSelector{}, fmt.Errorf("expected id selector (#id), found '%c' instead", p.s[p.i])
  281. }
  282. p.i++
  283. id, err := p.parseName()
  284. if err != nil {
  285. return idSelector{}, err
  286. }
  287. return idSelector{id: id}, nil
  288. }
  289. // parseClassSelector parses a selector that matches by class attribute.
  290. func (p *parser) parseClassSelector() (classSelector, error) {
  291. if p.i >= len(p.s) {
  292. return classSelector{}, fmt.Errorf("expected class selector (.class), found EOF instead")
  293. }
  294. if p.s[p.i] != '.' {
  295. return classSelector{}, fmt.Errorf("expected class selector (.class), found '%c' instead", p.s[p.i])
  296. }
  297. p.i++
  298. class, err := p.parseIdentifier()
  299. if err != nil {
  300. return classSelector{}, err
  301. }
  302. return classSelector{class: class}, nil
  303. }
  304. // parseAttributeSelector parses a selector that matches by attribute value.
  305. func (p *parser) parseAttributeSelector() (attrSelector, error) {
  306. if p.i >= len(p.s) {
  307. return attrSelector{}, fmt.Errorf("expected attribute selector ([attribute]), found EOF instead")
  308. }
  309. if p.s[p.i] != '[' {
  310. return attrSelector{}, fmt.Errorf("expected attribute selector ([attribute]), found '%c' instead", p.s[p.i])
  311. }
  312. p.i++
  313. p.skipWhitespace()
  314. key, err := p.parseIdentifier()
  315. if err != nil {
  316. return attrSelector{}, err
  317. }
  318. key = toLowerASCII(key)
  319. p.skipWhitespace()
  320. if p.i >= len(p.s) {
  321. return attrSelector{}, errors.New("unexpected EOF in attribute selector")
  322. }
  323. if p.s[p.i] == ']' {
  324. p.i++
  325. return attrSelector{key: key, operation: ""}, nil
  326. }
  327. if p.i+2 >= len(p.s) {
  328. return attrSelector{}, errors.New("unexpected EOF in attribute selector")
  329. }
  330. op := p.s[p.i : p.i+2]
  331. if op[0] == '=' {
  332. op = "="
  333. } else if op[1] != '=' {
  334. return attrSelector{}, fmt.Errorf(`expected equality operator, found "%s" instead`, op)
  335. }
  336. p.i += len(op)
  337. p.skipWhitespace()
  338. if p.i >= len(p.s) {
  339. return attrSelector{}, errors.New("unexpected EOF in attribute selector")
  340. }
  341. var val string
  342. var rx *regexp.Regexp
  343. if op == "#=" {
  344. rx, err = p.parseRegex()
  345. } else {
  346. switch p.s[p.i] {
  347. case '\'', '"':
  348. val, err = p.parseString()
  349. default:
  350. val, err = p.parseIdentifier()
  351. }
  352. }
  353. if err != nil {
  354. return attrSelector{}, err
  355. }
  356. p.skipWhitespace()
  357. if p.i >= len(p.s) {
  358. return attrSelector{}, errors.New("unexpected EOF in attribute selector")
  359. }
  360. if p.s[p.i] != ']' {
  361. return attrSelector{}, fmt.Errorf("expected ']', found '%c' instead", p.s[p.i])
  362. }
  363. p.i++
  364. switch op {
  365. case "=", "!=", "~=", "|=", "^=", "$=", "*=", "#=":
  366. return attrSelector{key: key, val: val, operation: op, regexp: rx}, nil
  367. default:
  368. return attrSelector{}, fmt.Errorf("attribute operator %q is not supported", op)
  369. }
  370. }
  // Errors reported while parsing the parenthesised argument of a
// pseudo-class.
var (
	// errExpectedParenthesis: an opening '(' was required but not present.
	errExpectedParenthesis = errors.New("expected '(' but didn't find it")
	// errExpectedClosingParenthesis: a closing ')' was required but not present.
	errExpectedClosingParenthesis = errors.New("expected ')' but didn't find it")
	// errUnmatchedParenthesis: input ended right after an opening '('.
	errUnmatchedParenthesis = errors.New("unmatched '('")
)
  374. // parsePseudoclassSelector parses a pseudoclass selector like :not(p)
  375. func (p *parser) parsePseudoclassSelector() (out Sel, err error) {
  376. if p.i >= len(p.s) {
  377. return nil, fmt.Errorf("expected pseudoclass selector (:pseudoclass), found EOF instead")
  378. }
  379. if p.s[p.i] != ':' {
  380. return nil, fmt.Errorf("expected attribute selector (:pseudoclass), found '%c' instead", p.s[p.i])
  381. }
  382. p.i++
  383. if p.s[p.i] == ':' { // we found a pseudo-element
  384. p.i++
  385. }
  386. name, err := p.parseIdentifier()
  387. if err != nil {
  388. return
  389. }
  390. name = toLowerASCII(name)
  391. switch name {
  392. case "not", "has", "haschild":
  393. if !p.consumeParenthesis() {
  394. return out, errExpectedParenthesis
  395. }
  396. sel, parseErr := p.parseSelectorGroup()
  397. if parseErr != nil {
  398. return out, parseErr
  399. }
  400. if !p.consumeClosingParenthesis() {
  401. return out, errExpectedClosingParenthesis
  402. }
  403. out = relativePseudoClassSelector{name: name, match: sel}
  404. case "contains", "containsown":
  405. if !p.consumeParenthesis() {
  406. return out, errExpectedParenthesis
  407. }
  408. if p.i == len(p.s) {
  409. return out, errUnmatchedParenthesis
  410. }
  411. var val string
  412. switch p.s[p.i] {
  413. case '\'', '"':
  414. val, err = p.parseString()
  415. default:
  416. val, err = p.parseIdentifier()
  417. }
  418. if err != nil {
  419. return out, err
  420. }
  421. val = strings.ToLower(val)
  422. p.skipWhitespace()
  423. if p.i >= len(p.s) {
  424. return out, errors.New("unexpected EOF in pseudo selector")
  425. }
  426. if !p.consumeClosingParenthesis() {
  427. return out, errExpectedClosingParenthesis
  428. }
  429. out = containsPseudoClassSelector{own: name == "containsown", value: val}
  430. case "matches", "matchesown":
  431. if !p.consumeParenthesis() {
  432. return out, errExpectedParenthesis
  433. }
  434. rx, err := p.parseRegex()
  435. if err != nil {
  436. return out, err
  437. }
  438. if p.i >= len(p.s) {
  439. return out, errors.New("unexpected EOF in pseudo selector")
  440. }
  441. if !p.consumeClosingParenthesis() {
  442. return out, errExpectedClosingParenthesis
  443. }
  444. out = regexpPseudoClassSelector{own: name == "matchesown", regexp: rx}
  445. case "nth-child", "nth-last-child", "nth-of-type", "nth-last-of-type":
  446. if !p.consumeParenthesis() {
  447. return out, errExpectedParenthesis
  448. }
  449. a, b, err := p.parseNth()
  450. if err != nil {
  451. return out, err
  452. }
  453. if !p.consumeClosingParenthesis() {
  454. return out, errExpectedClosingParenthesis
  455. }
  456. last := name == "nth-last-child" || name == "nth-last-of-type"
  457. ofType := name == "nth-of-type" || name == "nth-last-of-type"
  458. out = nthPseudoClassSelector{a: a, b: b, last: last, ofType: ofType}
  459. case "first-child":
  460. out = nthPseudoClassSelector{a: 0, b: 1, ofType: false, last: false}
  461. case "last-child":
  462. out = nthPseudoClassSelector{a: 0, b: 1, ofType: false, last: true}
  463. case "first-of-type":
  464. out = nthPseudoClassSelector{a: 0, b: 1, ofType: true, last: false}
  465. case "last-of-type":
  466. out = nthPseudoClassSelector{a: 0, b: 1, ofType: true, last: true}
  467. case "only-child":
  468. out = onlyChildPseudoClassSelector{ofType: false}
  469. case "only-of-type":
  470. out = onlyChildPseudoClassSelector{ofType: true}
  471. case "input":
  472. out = inputPseudoClassSelector{}
  473. case "empty":
  474. out = emptyElementPseudoClassSelector{}
  475. case "root":
  476. out = rootPseudoClassSelector{}
  477. case "after", "backdrop", "before", "cue", "first-letter", "first-line", "grammar-error", "marker", "placeholder", "selection", "spelling-error":
  478. return out, errors.New("pseudo-elements are not yet supported")
  479. default:
  480. return out, fmt.Errorf("unknown pseudoclass or pseudoelement :%s", name)
  481. }
  482. return
  483. }
  484. // parseInteger parses a decimal integer.
  485. func (p *parser) parseInteger() (int, error) {
  486. i := p.i
  487. start := i
  488. for i < len(p.s) && '0' <= p.s[i] && p.s[i] <= '9' {
  489. i++
  490. }
  491. if i == start {
  492. return 0, errors.New("expected integer, but didn't find it")
  493. }
  494. p.i = i
  495. val, err := strconv.Atoi(p.s[start:i])
  496. if err != nil {
  497. return 0, err
  498. }
  499. return val, nil
  500. }
  501. // parseNth parses the argument for :nth-child (normally of the form an+b).
  502. func (p *parser) parseNth() (a, b int, err error) {
  503. // initial state
  504. if p.i >= len(p.s) {
  505. goto eof
  506. }
  507. switch p.s[p.i] {
  508. case '-':
  509. p.i++
  510. goto negativeA
  511. case '+':
  512. p.i++
  513. goto positiveA
  514. case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  515. goto positiveA
  516. case 'n', 'N':
  517. a = 1
  518. p.i++
  519. goto readN
  520. case 'o', 'O', 'e', 'E':
  521. id, nameErr := p.parseName()
  522. if nameErr != nil {
  523. return 0, 0, nameErr
  524. }
  525. id = toLowerASCII(id)
  526. if id == "odd" {
  527. return 2, 1, nil
  528. }
  529. if id == "even" {
  530. return 2, 0, nil
  531. }
  532. return 0, 0, fmt.Errorf("expected 'odd' or 'even', but found '%s' instead", id)
  533. default:
  534. goto invalid
  535. }
  536. positiveA:
  537. if p.i >= len(p.s) {
  538. goto eof
  539. }
  540. switch p.s[p.i] {
  541. case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  542. a, err = p.parseInteger()
  543. if err != nil {
  544. return 0, 0, err
  545. }
  546. goto readA
  547. case 'n', 'N':
  548. a = 1
  549. p.i++
  550. goto readN
  551. default:
  552. goto invalid
  553. }
  554. negativeA:
  555. if p.i >= len(p.s) {
  556. goto eof
  557. }
  558. switch p.s[p.i] {
  559. case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  560. a, err = p.parseInteger()
  561. if err != nil {
  562. return 0, 0, err
  563. }
  564. a = -a
  565. goto readA
  566. case 'n', 'N':
  567. a = -1
  568. p.i++
  569. goto readN
  570. default:
  571. goto invalid
  572. }
  573. readA:
  574. if p.i >= len(p.s) {
  575. goto eof
  576. }
  577. switch p.s[p.i] {
  578. case 'n', 'N':
  579. p.i++
  580. goto readN
  581. default:
  582. // The number we read as a is actually b.
  583. return 0, a, nil
  584. }
  585. readN:
  586. p.skipWhitespace()
  587. if p.i >= len(p.s) {
  588. goto eof
  589. }
  590. switch p.s[p.i] {
  591. case '+':
  592. p.i++
  593. p.skipWhitespace()
  594. b, err = p.parseInteger()
  595. if err != nil {
  596. return 0, 0, err
  597. }
  598. return a, b, nil
  599. case '-':
  600. p.i++
  601. p.skipWhitespace()
  602. b, err = p.parseInteger()
  603. if err != nil {
  604. return 0, 0, err
  605. }
  606. return a, -b, nil
  607. default:
  608. return a, 0, nil
  609. }
  610. eof:
  611. return 0, 0, errors.New("unexpected EOF while attempting to parse expression of form an+b")
  612. invalid:
  613. return 0, 0, errors.New("unexpected character while attempting to parse expression of form an+b")
  614. }
  615. // parseSimpleSelectorSequence parses a selector sequence that applies to
  616. // a single element.
  617. func (p *parser) parseSimpleSelectorSequence() (Sel, error) {
  618. var selectors []Sel
  619. if p.i >= len(p.s) {
  620. return nil, errors.New("expected selector, found EOF instead")
  621. }
  622. switch p.s[p.i] {
  623. case '*':
  624. // It's the universal selector. Just skip over it, since it doesn't affect the meaning.
  625. p.i++
  626. case '#', '.', '[', ':':
  627. // There's no type selector. Wait to process the other till the main loop.
  628. default:
  629. r, err := p.parseTypeSelector()
  630. if err != nil {
  631. return nil, err
  632. }
  633. selectors = append(selectors, r)
  634. }
  635. loop:
  636. for p.i < len(p.s) {
  637. var (
  638. ns Sel
  639. err error
  640. )
  641. switch p.s[p.i] {
  642. case '#':
  643. ns, err = p.parseIDSelector()
  644. case '.':
  645. ns, err = p.parseClassSelector()
  646. case '[':
  647. ns, err = p.parseAttributeSelector()
  648. case ':':
  649. ns, err = p.parsePseudoclassSelector()
  650. default:
  651. break loop
  652. }
  653. if err != nil {
  654. return nil, err
  655. }
  656. selectors = append(selectors, ns)
  657. }
  658. if len(selectors) == 1 { // no need wrap the selectors in compoundSelector
  659. return selectors[0], nil
  660. }
  661. return compoundSelector{selectors: selectors}, nil
  662. }
  663. // parseSelector parses a selector that may include combinators.
  664. func (p *parser) parseSelector() (Sel, error) {
  665. p.skipWhitespace()
  666. result, err := p.parseSimpleSelectorSequence()
  667. if err != nil {
  668. return nil, err
  669. }
  670. for {
  671. var (
  672. combinator byte
  673. c Sel
  674. )
  675. if p.skipWhitespace() {
  676. combinator = ' '
  677. }
  678. if p.i >= len(p.s) {
  679. return result, nil
  680. }
  681. switch p.s[p.i] {
  682. case '+', '>', '~':
  683. combinator = p.s[p.i]
  684. p.i++
  685. p.skipWhitespace()
  686. case ',', ')':
  687. // These characters can't begin a selector, but they can legally occur after one.
  688. return result, nil
  689. }
  690. if combinator == 0 {
  691. return result, nil
  692. }
  693. c, err = p.parseSimpleSelectorSequence()
  694. if err != nil {
  695. return nil, err
  696. }
  697. result = combinedSelector{first: result, combinator: combinator, second: c}
  698. }
  699. }
  700. // parseSelectorGroup parses a group of selectors, separated by commas.
  701. func (p *parser) parseSelectorGroup() (SelectorGroup, error) {
  702. current, err := p.parseSelector()
  703. if err != nil {
  704. return nil, err
  705. }
  706. result := SelectorGroup{current}
  707. for p.i < len(p.s) {
  708. if p.s[p.i] != ',' {
  709. break
  710. }
  711. p.i++
  712. c, err := p.parseSelector()
  713. if err != nil {
  714. return nil, err
  715. }
  716. result = append(result, c)
  717. }
  718. return result, nil
  719. }