diff options
Diffstat (limited to 'server/src/org/jsoup/select/QueryParser.java')
-rw-r--r-- | server/src/org/jsoup/select/QueryParser.java | 293 |
1 files changed, 293 insertions, 0 deletions
diff --git a/server/src/org/jsoup/select/QueryParser.java b/server/src/org/jsoup/select/QueryParser.java new file mode 100644 index 0000000000..d3cc36f91c --- /dev/null +++ b/server/src/org/jsoup/select/QueryParser.java @@ -0,0 +1,293 @@ +package org.jsoup.select; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +import org.jsoup.helper.StringUtil; +import org.jsoup.helper.Validate; +import org.jsoup.parser.TokenQueue; + +/** + * Parses a CSS selector into an Evaluator tree. + */ +class QueryParser { + private final static String[] combinators = {",", ">", "+", "~", " "}; + + private TokenQueue tq; + private String query; + private List<Evaluator> evals = new ArrayList<Evaluator>(); + + /** + * Create a new QueryParser. + * @param query CSS query + */ + private QueryParser(String query) { + this.query = query; + this.tq = new TokenQueue(query); + } + + /** + * Parse a CSS query into an Evaluator. + * @param query CSS query + * @return Evaluator + */ + public static Evaluator parse(String query) { + QueryParser p = new QueryParser(query); + return p.parse(); + } + + /** + * Parse the query + * @return Evaluator + */ + Evaluator parse() { + tq.consumeWhitespace(); + + if (tq.matchesAny(combinators)) { // if starts with a combinator, use root as elements + evals.add(new StructuralEvaluator.Root()); + combinator(tq.consume()); + } else { + findElements(); + } + + while (!tq.isEmpty()) { + // hierarchy and extras + boolean seenWhite = tq.consumeWhitespace(); + + if (tq.matchesAny(combinators)) { + combinator(tq.consume()); + } else if (seenWhite) { + combinator(' '); + } else { // E.class, E#id, E[attr] etc. AND + findElements(); // take next el, #. etc off queue + } + } + + if (evals.size() == 1) + return evals.get(0); + + return new CombiningEvaluator.And(evals); + } + + private void combinator(char combinator) { + tq.consumeWhitespace(); + String subQuery = consumeSubQuery(); // support multi > childs + + Evaluator rootEval; // the new topmost evaluator + Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or. + Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator + boolean replaceRightMost = false; + + if (evals.size() == 1) { + rootEval = currentEval = evals.get(0); + // make sure OR (,) has precedence: + if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') { + currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator(); + replaceRightMost = true; + } + } + else { + rootEval = currentEval = new CombiningEvaluator.And(evals); + } + evals.clear(); + + // for most combinators: change the current eval into an AND of the current eval and the new eval + if (combinator == '>') + currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediateParent(currentEval)); + else if (combinator == ' ') + currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.Parent(currentEval)); + else if (combinator == '+') + currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediatePreviousSibling(currentEval)); + else if (combinator == '~') + currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.PreviousSibling(currentEval)); + else if (combinator == ',') { // group or. + CombiningEvaluator.Or or; + if (currentEval instanceof CombiningEvaluator.Or) { + or = (CombiningEvaluator.Or) currentEval; + or.add(newEval); + } else { + or = new CombiningEvaluator.Or(); + or.add(currentEval); + or.add(newEval); + } + currentEval = or; + } + else + throw new Selector.SelectorParseException("Unknown combinator: " + combinator); + + if (replaceRightMost) + ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval); + else rootEval = currentEval; + evals.add(rootEval); + } + + private String consumeSubQuery() { + StringBuilder sq = new StringBuilder(); + while (!tq.isEmpty()) { + if (tq.matches("(")) + sq.append("(").append(tq.chompBalanced('(', ')')).append(")"); + else if (tq.matches("[")) + sq.append("[").append(tq.chompBalanced('[', ']')).append("]"); + else if (tq.matchesAny(combinators)) + break; + else + sq.append(tq.consume()); + } + return sq.toString(); + } + + private void findElements() { + if (tq.matchChomp("#")) + byId(); + else if (tq.matchChomp(".")) + byClass(); + else if (tq.matchesWord()) + byTag(); + else if (tq.matches("[")) + byAttribute(); + else if (tq.matchChomp("*")) + allElements(); + else if (tq.matchChomp(":lt(")) + indexLessThan(); + else if (tq.matchChomp(":gt(")) + indexGreaterThan(); + else if (tq.matchChomp(":eq(")) + indexEquals(); + else if (tq.matches(":has(")) + has(); + else if (tq.matches(":contains(")) + contains(false); + else if (tq.matches(":containsOwn(")) + contains(true); + else if (tq.matches(":matches(")) + matches(false); + else if (tq.matches(":matchesOwn(")) + matches(true); + else if (tq.matches(":not(")) + not(); + else // unhandled + throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); + + } + + private void byId() { + String id = tq.consumeCssIdentifier(); + Validate.notEmpty(id); + evals.add(new Evaluator.Id(id)); + } + + private void byClass() { + String className = tq.consumeCssIdentifier(); + Validate.notEmpty(className); + evals.add(new Evaluator.Class(className.trim().toLowerCase())); + } + + private void byTag() { + String tagName = tq.consumeElementSelector(); + Validate.notEmpty(tagName); + + // namespaces: if element name is "abc:def", selector must be "abc|def", so flip: + if (tagName.contains("|")) + tagName = tagName.replace("|", ":"); + + evals.add(new Evaluator.Tag(tagName.trim().toLowerCase())); + } + + private void byAttribute() { + TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue + String key = cq.consumeToAny("=", "!=", "^=", "$=", "*=", "~="); // eq, not, start, end, contain, match, (no val) + Validate.notEmpty(key); + cq.consumeWhitespace(); + + if (cq.isEmpty()) { + if (key.startsWith("^")) + evals.add(new Evaluator.AttributeStarting(key.substring(1))); + else + evals.add(new Evaluator.Attribute(key)); + } else { + if (cq.matchChomp("=")) + evals.add(new Evaluator.AttributeWithValue(key, cq.remainder())); + + else if (cq.matchChomp("!=")) + evals.add(new Evaluator.AttributeWithValueNot(key, cq.remainder())); + + else if (cq.matchChomp("^=")) + evals.add(new Evaluator.AttributeWithValueStarting(key, cq.remainder())); + + else if (cq.matchChomp("$=")) + evals.add(new Evaluator.AttributeWithValueEnding(key, cq.remainder())); + + else if (cq.matchChomp("*=")) + evals.add(new Evaluator.AttributeWithValueContaining(key, cq.remainder())); + + else if (cq.matchChomp("~=")) + evals.add(new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()))); + else + throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder()); + } + } + + private void allElements() { + evals.add(new Evaluator.AllElements()); + } + + // pseudo selectors :lt, :gt, :eq + private void indexLessThan() { + evals.add(new Evaluator.IndexLessThan(consumeIndex())); + } + + private void indexGreaterThan() { + evals.add(new Evaluator.IndexGreaterThan(consumeIndex())); + } + + private void indexEquals() { + evals.add(new Evaluator.IndexEquals(consumeIndex())); + } + + private int consumeIndex() { + String indexS = tq.chompTo(")").trim(); + Validate.isTrue(StringUtil.isNumeric(indexS), "Index must be numeric"); + return Integer.parseInt(indexS); + } + + // pseudo selector :has(el) + private void has() { + tq.consume(":has"); + String subQuery = tq.chompBalanced('(', ')'); + Validate.notEmpty(subQuery, ":has(el) subselect must not be empty"); + evals.add(new StructuralEvaluator.Has(parse(subQuery))); + } + + // pseudo selector :contains(text), containsOwn(text) + private void contains(boolean own) { + tq.consume(own ? ":containsOwn" : ":contains"); + String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')')); + Validate.notEmpty(searchText, ":contains(text) query must not be empty"); + if (own) + evals.add(new Evaluator.ContainsOwnText(searchText)); + else + evals.add(new Evaluator.ContainsText(searchText)); + } + + // :matches(regex), matchesOwn(regex) + private void matches(boolean own) { + tq.consume(own ? ":matchesOwn" : ":matches"); + String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex bits will be escaped + Validate.notEmpty(regex, ":matches(regex) query must not be empty"); + + if (own) + evals.add(new Evaluator.MatchesOwn(Pattern.compile(regex))); + else + evals.add(new Evaluator.Matches(Pattern.compile(regex))); + } + + // :not(selector) + private void not() { + tq.consume(":not"); + String subQuery = tq.chompBalanced('(', ')'); + Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty"); + + evals.add(new StructuralEvaluator.Not(parse(subQuery))); + } +} |