You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

parser.go 17KB


  1. // Package cascadia is an implementation of CSS selectors.
  2. package cascadia
  3. import (
  4. "errors"
  5. "fmt"
  6. "regexp"
  7. "strconv"
  8. "strings"
  9. "golang.org/x/net/html"
  10. )
  // parser holds the state of one pass over a CSS selector string.
  type parser struct {
  	// s is the complete source text being parsed.
  	s string
  	// i is the byte offset into s of the next unread character.
  	i int
  }
  16. // parseEscape parses a backslash escape.
  17. func (p *parser) parseEscape() (result string, err error) {
  18. if len(p.s) < p.i+2 || p.s[p.i] != '\\' {
  19. return "", errors.New("invalid escape sequence")
  20. }
  21. start := p.i + 1
  22. c := p.s[start]
  23. switch {
  24. case c == '\r' || c == '\n' || c == '\f':
  25. return "", errors.New("escaped line ending outside string")
  26. case hexDigit(c):
  27. // unicode escape (hex)
  28. var i int
  29. for i = start; i < p.i+6 && i < len(p.s) && hexDigit(p.s[i]); i++ {
  30. // empty
  31. }
  32. v, _ := strconv.ParseUint(p.s[start:i], 16, 21)
  33. if len(p.s) > i {
  34. switch p.s[i] {
  35. case '\r':
  36. i++
  37. if len(p.s) > i && p.s[i] == '\n' {
  38. i++
  39. }
  40. case ' ', '\t', '\n', '\f':
  41. i++
  42. }
  43. }
  44. p.i = i
  45. return string(rune(v)), nil
  46. }
  47. // Return the literal character after the backslash.
  48. result = p.s[start : start+1]
  49. p.i += 2
  50. return result, nil
  51. }
  // hexDigit reports whether c is an ASCII hexadecimal digit
  // (0-9, a-f or A-F).
  func hexDigit(c byte) bool {
  	switch {
  	case c >= '0' && c <= '9':
  		return true
  	case c >= 'a' && c <= 'f':
  		return true
  	case c >= 'A' && c <= 'F':
  		return true
  	}
  	return false
  }
  // nameStart reports whether c may begin an identifier: an ASCII letter, an
  // underscore, or any byte outside the ASCII range. A leading hyphen and
  // escape sequences are handled by the caller and are not accepted here.
  func nameStart(c byte) bool {
  	switch {
  	case c == '_', c >= 0x80:
  		return true
  	case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z':
  		return true
  	}
  	return false
  }
  // nameChar reports whether c may appear inside an identifier: an ASCII
  // letter or digit, an underscore, a hyphen, or any byte outside the ASCII
  // range. Escape sequences are handled by the caller and are not accepted
  // here.
  func nameChar(c byte) bool {
  	switch {
  	case c == '_', c == '-', c >= 0x80:
  		return true
  	case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z':
  		return true
  	case c >= '0' && c <= '9':
  		return true
  	}
  	return false
  }
  66. // parseIdentifier parses an identifier.
  67. func (p *parser) parseIdentifier() (result string, err error) {
  68. startingDash := false
  69. if len(p.s) > p.i && p.s[p.i] == '-' {
  70. startingDash = true
  71. p.i++
  72. }
  73. if len(p.s) <= p.i {
  74. return "", errors.New("expected identifier, found EOF instead")
  75. }
  76. if c := p.s[p.i]; !(nameStart(c) || c == '\\') {
  77. return "", fmt.Errorf("expected identifier, found %c instead", c)
  78. }
  79. result, err = p.parseName()
  80. if startingDash && err == nil {
  81. result = "-" + result
  82. }
  83. return
  84. }
  85. // parseName parses a name (which is like an identifier, but doesn't have
  86. // extra restrictions on the first character).
  87. func (p *parser) parseName() (result string, err error) {
  88. i := p.i
  89. loop:
  90. for i < len(p.s) {
  91. c := p.s[i]
  92. switch {
  93. case nameChar(c):
  94. start := i
  95. for i < len(p.s) && nameChar(p.s[i]) {
  96. i++
  97. }
  98. result += p.s[start:i]
  99. case c == '\\':
  100. p.i = i
  101. val, err := p.parseEscape()
  102. if err != nil {
  103. return "", err
  104. }
  105. i = p.i
  106. result += val
  107. default:
  108. break loop
  109. }
  110. }
  111. if result == "" {
  112. return "", errors.New("expected name, found EOF instead")
  113. }
  114. p.i = i
  115. return result, nil
  116. }
  117. // parseString parses a single- or double-quoted string.
  118. func (p *parser) parseString() (result string, err error) {
  119. i := p.i
  120. if len(p.s) < i+2 {
  121. return "", errors.New("expected string, found EOF instead")
  122. }
  123. quote := p.s[i]
  124. i++
  125. loop:
  126. for i < len(p.s) {
  127. switch p.s[i] {
  128. case '\\':
  129. if len(p.s) > i+1 {
  130. switch c := p.s[i+1]; c {
  131. case '\r':
  132. if len(p.s) > i+2 && p.s[i+2] == '\n' {
  133. i += 3
  134. continue loop
  135. }
  136. fallthrough
  137. case '\n', '\f':
  138. i += 2
  139. continue loop
  140. }
  141. }
  142. p.i = i
  143. val, err := p.parseEscape()
  144. if err != nil {
  145. return "", err
  146. }
  147. i = p.i
  148. result += val
  149. case quote:
  150. break loop
  151. case '\r', '\n', '\f':
  152. return "", errors.New("unexpected end of line in string")
  153. default:
  154. start := i
  155. for i < len(p.s) {
  156. if c := p.s[i]; c == quote || c == '\\' || c == '\r' || c == '\n' || c == '\f' {
  157. break
  158. }
  159. i++
  160. }
  161. result += p.s[start:i]
  162. }
  163. }
  164. if i >= len(p.s) {
  165. return "", errors.New("EOF in string")
  166. }
  167. // Consume the final quote.
  168. i++
  169. p.i = i
  170. return result, nil
  171. }
  172. // parseRegex parses a regular expression; the end is defined by encountering an
  173. // unmatched closing ')' or ']' which is not consumed
  174. func (p *parser) parseRegex() (rx *regexp.Regexp, err error) {
  175. i := p.i
  176. if len(p.s) < i+2 {
  177. return nil, errors.New("expected regular expression, found EOF instead")
  178. }
  179. // number of open parens or brackets;
  180. // when it becomes negative, finished parsing regex
  181. open := 0
  182. loop:
  183. for i < len(p.s) {
  184. switch p.s[i] {
  185. case '(', '[':
  186. open++
  187. case ')', ']':
  188. open--
  189. if open < 0 {
  190. break loop
  191. }
  192. }
  193. i++
  194. }
  195. if i >= len(p.s) {
  196. return nil, errors.New("EOF in regular expression")
  197. }
  198. rx, err = regexp.Compile(p.s[p.i:i])
  199. p.i = i
  200. return rx, err
  201. }
  202. // skipWhitespace consumes whitespace characters and comments.
  203. // It returns true if there was actually anything to skip.
  204. func (p *parser) skipWhitespace() bool {
  205. i := p.i
  206. for i < len(p.s) {
  207. switch p.s[i] {
  208. case ' ', '\t', '\r', '\n', '\f':
  209. i++
  210. continue
  211. case '/':
  212. if strings.HasPrefix(p.s[i:], "/*") {
  213. end := strings.Index(p.s[i+len("/*"):], "*/")
  214. if end != -1 {
  215. i += end + len("/**/")
  216. continue
  217. }
  218. }
  219. }
  220. break
  221. }
  222. if i > p.i {
  223. p.i = i
  224. return true
  225. }
  226. return false
  227. }
  228. // consumeParenthesis consumes an opening parenthesis and any following
  229. // whitespace. It returns true if there was actually a parenthesis to skip.
  230. func (p *parser) consumeParenthesis() bool {
  231. if p.i < len(p.s) && p.s[p.i] == '(' {
  232. p.i++
  233. p.skipWhitespace()
  234. return true
  235. }
  236. return false
  237. }
  238. // consumeClosingParenthesis consumes a closing parenthesis and any preceding
  239. // whitespace. It returns true if there was actually a parenthesis to skip.
  240. func (p *parser) consumeClosingParenthesis() bool {
  241. i := p.i
  242. p.skipWhitespace()
  243. if p.i < len(p.s) && p.s[p.i] == ')' {
  244. p.i++
  245. return true
  246. }
  247. p.i = i
  248. return false
  249. }
  250. // parseTypeSelector parses a type selector (one that matches by tag name).
  251. func (p *parser) parseTypeSelector() (result Selector, err error) {
  252. tag, err := p.parseIdentifier()
  253. if err != nil {
  254. return nil, err
  255. }
  256. return typeSelector(tag), nil
  257. }
  258. // parseIDSelector parses a selector that matches by id attribute.
  259. func (p *parser) parseIDSelector() (Selector, error) {
  260. if p.i >= len(p.s) {
  261. return nil, fmt.Errorf("expected id selector (#id), found EOF instead")
  262. }
  263. if p.s[p.i] != '#' {
  264. return nil, fmt.Errorf("expected id selector (#id), found '%c' instead", p.s[p.i])
  265. }
  266. p.i++
  267. id, err := p.parseName()
  268. if err != nil {
  269. return nil, err
  270. }
  271. return attributeEqualsSelector("id", id), nil
  272. }
  273. // parseClassSelector parses a selector that matches by class attribute.
  274. func (p *parser) parseClassSelector() (Selector, error) {
  275. if p.i >= len(p.s) {
  276. return nil, fmt.Errorf("expected class selector (.class), found EOF instead")
  277. }
  278. if p.s[p.i] != '.' {
  279. return nil, fmt.Errorf("expected class selector (.class), found '%c' instead", p.s[p.i])
  280. }
  281. p.i++
  282. class, err := p.parseIdentifier()
  283. if err != nil {
  284. return nil, err
  285. }
  286. return attributeIncludesSelector("class", class), nil
  287. }
  288. // parseAttributeSelector parses a selector that matches by attribute value.
  289. func (p *parser) parseAttributeSelector() (Selector, error) {
  290. if p.i >= len(p.s) {
  291. return nil, fmt.Errorf("expected attribute selector ([attribute]), found EOF instead")
  292. }
  293. if p.s[p.i] != '[' {
  294. return nil, fmt.Errorf("expected attribute selector ([attribute]), found '%c' instead", p.s[p.i])
  295. }
  296. p.i++
  297. p.skipWhitespace()
  298. key, err := p.parseIdentifier()
  299. if err != nil {
  300. return nil, err
  301. }
  302. p.skipWhitespace()
  303. if p.i >= len(p.s) {
  304. return nil, errors.New("unexpected EOF in attribute selector")
  305. }
  306. if p.s[p.i] == ']' {
  307. p.i++
  308. return attributeExistsSelector(key), nil
  309. }
  310. if p.i+2 >= len(p.s) {
  311. return nil, errors.New("unexpected EOF in attribute selector")
  312. }
  313. op := p.s[p.i : p.i+2]
  314. if op[0] == '=' {
  315. op = "="
  316. } else if op[1] != '=' {
  317. return nil, fmt.Errorf(`expected equality operator, found "%s" instead`, op)
  318. }
  319. p.i += len(op)
  320. p.skipWhitespace()
  321. if p.i >= len(p.s) {
  322. return nil, errors.New("unexpected EOF in attribute selector")
  323. }
  324. var val string
  325. var rx *regexp.Regexp
  326. if op == "#=" {
  327. rx, err = p.parseRegex()
  328. } else {
  329. switch p.s[p.i] {
  330. case '\'', '"':
  331. val, err = p.parseString()
  332. default:
  333. val, err = p.parseIdentifier()
  334. }
  335. }
  336. if err != nil {
  337. return nil, err
  338. }
  339. p.skipWhitespace()
  340. if p.i >= len(p.s) {
  341. return nil, errors.New("unexpected EOF in attribute selector")
  342. }
  343. if p.s[p.i] != ']' {
  344. return nil, fmt.Errorf("expected ']', found '%c' instead", p.s[p.i])
  345. }
  346. p.i++
  347. switch op {
  348. case "=":
  349. return attributeEqualsSelector(key, val), nil
  350. case "!=":
  351. return attributeNotEqualSelector(key, val), nil
  352. case "~=":
  353. return attributeIncludesSelector(key, val), nil
  354. case "|=":
  355. return attributeDashmatchSelector(key, val), nil
  356. case "^=":
  357. return attributePrefixSelector(key, val), nil
  358. case "$=":
  359. return attributeSuffixSelector(key, val), nil
  360. case "*=":
  361. return attributeSubstringSelector(key, val), nil
  362. case "#=":
  363. return attributeRegexSelector(key, rx), nil
  364. }
  365. return nil, fmt.Errorf("attribute operator %q is not supported", op)
  366. }
  // Errors reported while parsing the parenthesised argument of a pseudoclass.
  var (
  	// errExpectedParenthesis: an opening '(' was required but not present.
  	errExpectedParenthesis = errors.New("expected '(' but didn't find it")
  	// errExpectedClosingParenthesis: a closing ')' was required but not present.
  	errExpectedClosingParenthesis = errors.New("expected ')' but didn't find it")
  	// errUnmatchedParenthesis: input ended right after an opening '('.
  	errUnmatchedParenthesis = errors.New("unmatched '('")
  )
  370. // parsePseudoclassSelector parses a pseudoclass selector like :not(p).
  371. func (p *parser) parsePseudoclassSelector() (Selector, error) {
  372. if p.i >= len(p.s) {
  373. return nil, fmt.Errorf("expected pseudoclass selector (:pseudoclass), found EOF instead")
  374. }
  375. if p.s[p.i] != ':' {
  376. return nil, fmt.Errorf("expected attribute selector (:pseudoclass), found '%c' instead", p.s[p.i])
  377. }
  378. p.i++
  379. name, err := p.parseIdentifier()
  380. if err != nil {
  381. return nil, err
  382. }
  383. name = toLowerASCII(name)
  384. switch name {
  385. case "not", "has", "haschild":
  386. if !p.consumeParenthesis() {
  387. return nil, errExpectedParenthesis
  388. }
  389. sel, parseErr := p.parseSelectorGroup()
  390. if parseErr != nil {
  391. return nil, parseErr
  392. }
  393. if !p.consumeClosingParenthesis() {
  394. return nil, errExpectedClosingParenthesis
  395. }
  396. switch name {
  397. case "not":
  398. return negatedSelector(sel), nil
  399. case "has":
  400. return hasDescendantSelector(sel), nil
  401. case "haschild":
  402. return hasChildSelector(sel), nil
  403. }
  404. case "contains", "containsown":
  405. if !p.consumeParenthesis() {
  406. return nil, errExpectedParenthesis
  407. }
  408. if p.i == len(p.s) {
  409. return nil, errUnmatchedParenthesis
  410. }
  411. var val string
  412. switch p.s[p.i] {
  413. case '\'', '"':
  414. val, err = p.parseString()
  415. default:
  416. val, err = p.parseIdentifier()
  417. }
  418. if err != nil {
  419. return nil, err
  420. }
  421. val = strings.ToLower(val)
  422. p.skipWhitespace()
  423. if p.i >= len(p.s) {
  424. return nil, errors.New("unexpected EOF in pseudo selector")
  425. }
  426. if !p.consumeClosingParenthesis() {
  427. return nil, errExpectedClosingParenthesis
  428. }
  429. switch name {
  430. case "contains":
  431. return textSubstrSelector(val), nil
  432. case "containsown":
  433. return ownTextSubstrSelector(val), nil
  434. }
  435. case "matches", "matchesown":
  436. if !p.consumeParenthesis() {
  437. return nil, errExpectedParenthesis
  438. }
  439. rx, err := p.parseRegex()
  440. if err != nil {
  441. return nil, err
  442. }
  443. if p.i >= len(p.s) {
  444. return nil, errors.New("unexpected EOF in pseudo selector")
  445. }
  446. if !p.consumeClosingParenthesis() {
  447. return nil, errExpectedClosingParenthesis
  448. }
  449. switch name {
  450. case "matches":
  451. return textRegexSelector(rx), nil
  452. case "matchesown":
  453. return ownTextRegexSelector(rx), nil
  454. }
  455. case "nth-child", "nth-last-child", "nth-of-type", "nth-last-of-type":
  456. if !p.consumeParenthesis() {
  457. return nil, errExpectedParenthesis
  458. }
  459. a, b, err := p.parseNth()
  460. if err != nil {
  461. return nil, err
  462. }
  463. if !p.consumeClosingParenthesis() {
  464. return nil, errExpectedClosingParenthesis
  465. }
  466. if a == 0 {
  467. switch name {
  468. case "nth-child":
  469. return simpleNthChildSelector(b, false), nil
  470. case "nth-of-type":
  471. return simpleNthChildSelector(b, true), nil
  472. case "nth-last-child":
  473. return simpleNthLastChildSelector(b, false), nil
  474. case "nth-last-of-type":
  475. return simpleNthLastChildSelector(b, true), nil
  476. }
  477. }
  478. return nthChildSelector(a, b,
  479. name == "nth-last-child" || name == "nth-last-of-type",
  480. name == "nth-of-type" || name == "nth-last-of-type"),
  481. nil
  482. case "first-child":
  483. return simpleNthChildSelector(1, false), nil
  484. case "last-child":
  485. return simpleNthLastChildSelector(1, false), nil
  486. case "first-of-type":
  487. return simpleNthChildSelector(1, true), nil
  488. case "last-of-type":
  489. return simpleNthLastChildSelector(1, true), nil
  490. case "only-child":
  491. return onlyChildSelector(false), nil
  492. case "only-of-type":
  493. return onlyChildSelector(true), nil
  494. case "input":
  495. return inputSelector, nil
  496. case "empty":
  497. return emptyElementSelector, nil
  498. case "root":
  499. return rootSelector, nil
  500. }
  501. return nil, fmt.Errorf("unknown pseudoclass :%s", name)
  502. }
  503. // parseInteger parses a decimal integer.
  504. func (p *parser) parseInteger() (int, error) {
  505. i := p.i
  506. start := i
  507. for i < len(p.s) && '0' <= p.s[i] && p.s[i] <= '9' {
  508. i++
  509. }
  510. if i == start {
  511. return 0, errors.New("expected integer, but didn't find it")
  512. }
  513. p.i = i
  514. val, err := strconv.Atoi(p.s[start:i])
  515. if err != nil {
  516. return 0, err
  517. }
  518. return val, nil
  519. }
  520. // parseNth parses the argument for :nth-child (normally of the form an+b).
  521. func (p *parser) parseNth() (a, b int, err error) {
  522. // initial state
  523. if p.i >= len(p.s) {
  524. goto eof
  525. }
  526. switch p.s[p.i] {
  527. case '-':
  528. p.i++
  529. goto negativeA
  530. case '+':
  531. p.i++
  532. goto positiveA
  533. case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  534. goto positiveA
  535. case 'n', 'N':
  536. a = 1
  537. p.i++
  538. goto readN
  539. case 'o', 'O', 'e', 'E':
  540. id, nameErr := p.parseName()
  541. if nameErr != nil {
  542. return 0, 0, nameErr
  543. }
  544. id = toLowerASCII(id)
  545. if id == "odd" {
  546. return 2, 1, nil
  547. }
  548. if id == "even" {
  549. return 2, 0, nil
  550. }
  551. return 0, 0, fmt.Errorf("expected 'odd' or 'even', but found '%s' instead", id)
  552. default:
  553. goto invalid
  554. }
  555. positiveA:
  556. if p.i >= len(p.s) {
  557. goto eof
  558. }
  559. switch p.s[p.i] {
  560. case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  561. a, err = p.parseInteger()
  562. if err != nil {
  563. return 0, 0, err
  564. }
  565. goto readA
  566. case 'n', 'N':
  567. a = 1
  568. p.i++
  569. goto readN
  570. default:
  571. goto invalid
  572. }
  573. negativeA:
  574. if p.i >= len(p.s) {
  575. goto eof
  576. }
  577. switch p.s[p.i] {
  578. case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  579. a, err = p.parseInteger()
  580. if err != nil {
  581. return 0, 0, err
  582. }
  583. a = -a
  584. goto readA
  585. case 'n', 'N':
  586. a = -1
  587. p.i++
  588. goto readN
  589. default:
  590. goto invalid
  591. }
  592. readA:
  593. if p.i >= len(p.s) {
  594. goto eof
  595. }
  596. switch p.s[p.i] {
  597. case 'n', 'N':
  598. p.i++
  599. goto readN
  600. default:
  601. // The number we read as a is actually b.
  602. return 0, a, nil
  603. }
  604. readN:
  605. p.skipWhitespace()
  606. if p.i >= len(p.s) {
  607. goto eof
  608. }
  609. switch p.s[p.i] {
  610. case '+':
  611. p.i++
  612. p.skipWhitespace()
  613. b, err = p.parseInteger()
  614. if err != nil {
  615. return 0, 0, err
  616. }
  617. return a, b, nil
  618. case '-':
  619. p.i++
  620. p.skipWhitespace()
  621. b, err = p.parseInteger()
  622. if err != nil {
  623. return 0, 0, err
  624. }
  625. return a, -b, nil
  626. default:
  627. return a, 0, nil
  628. }
  629. eof:
  630. return 0, 0, errors.New("unexpected EOF while attempting to parse expression of form an+b")
  631. invalid:
  632. return 0, 0, errors.New("unexpected character while attempting to parse expression of form an+b")
  633. }
  634. // parseSimpleSelectorSequence parses a selector sequence that applies to
  635. // a single element.
  636. func (p *parser) parseSimpleSelectorSequence() (Selector, error) {
  637. var result Selector
  638. if p.i >= len(p.s) {
  639. return nil, errors.New("expected selector, found EOF instead")
  640. }
  641. switch p.s[p.i] {
  642. case '*':
  643. // It's the universal selector. Just skip over it, since it doesn't affect the meaning.
  644. p.i++
  645. case '#', '.', '[', ':':
  646. // There's no type selector. Wait to process the other till the main loop.
  647. default:
  648. r, err := p.parseTypeSelector()
  649. if err != nil {
  650. return nil, err
  651. }
  652. result = r
  653. }
  654. loop:
  655. for p.i < len(p.s) {
  656. var ns Selector
  657. var err error
  658. switch p.s[p.i] {
  659. case '#':
  660. ns, err = p.parseIDSelector()
  661. case '.':
  662. ns, err = p.parseClassSelector()
  663. case '[':
  664. ns, err = p.parseAttributeSelector()
  665. case ':':
  666. ns, err = p.parsePseudoclassSelector()
  667. default:
  668. break loop
  669. }
  670. if err != nil {
  671. return nil, err
  672. }
  673. if result == nil {
  674. result = ns
  675. } else {
  676. result = intersectionSelector(result, ns)
  677. }
  678. }
  679. if result == nil {
  680. result = func(n *html.Node) bool {
  681. return n.Type == html.ElementNode
  682. }
  683. }
  684. return result, nil
  685. }
  686. // parseSelector parses a selector that may include combinators.
  687. func (p *parser) parseSelector() (result Selector, err error) {
  688. p.skipWhitespace()
  689. result, err = p.parseSimpleSelectorSequence()
  690. if err != nil {
  691. return
  692. }
  693. for {
  694. var combinator byte
  695. if p.skipWhitespace() {
  696. combinator = ' '
  697. }
  698. if p.i >= len(p.s) {
  699. return
  700. }
  701. switch p.s[p.i] {
  702. case '+', '>', '~':
  703. combinator = p.s[p.i]
  704. p.i++
  705. p.skipWhitespace()
  706. case ',', ')':
  707. // These characters can't begin a selector, but they can legally occur after one.
  708. return
  709. }
  710. if combinator == 0 {
  711. return
  712. }
  713. c, err := p.parseSimpleSelectorSequence()
  714. if err != nil {
  715. return nil, err
  716. }
  717. switch combinator {
  718. case ' ':
  719. result = descendantSelector(result, c)
  720. case '>':
  721. result = childSelector(result, c)
  722. case '+':
  723. result = siblingSelector(result, c, true)
  724. case '~':
  725. result = siblingSelector(result, c, false)
  726. }
  727. }
  728. panic("unreachable")
  729. }
  730. // parseSelectorGroup parses a group of selectors, separated by commas.
  731. func (p *parser) parseSelectorGroup() (result Selector, err error) {
  732. result, err = p.parseSelector()
  733. if err != nil {
  734. return
  735. }
  736. for p.i < len(p.s) {
  737. if p.s[p.i] != ',' {
  738. return result, nil
  739. }
  740. p.i++
  741. c, err := p.parseSelector()
  742. if err != nil {
  743. return nil, err
  744. }
  745. result = unionSelector(result, c)
  746. }
  747. return
  748. }