summaryrefslogtreecommitdiffstats
path: root/server/src/org/jsoup/parser/TokenQueue.java
diff options
context:
space:
mode:
Diffstat (limited to 'server/src/org/jsoup/parser/TokenQueue.java')
-rw-r--r--server/src/org/jsoup/parser/TokenQueue.java393
1 files changed, 393 insertions, 0 deletions
diff --git a/server/src/org/jsoup/parser/TokenQueue.java b/server/src/org/jsoup/parser/TokenQueue.java
new file mode 100644
index 0000000000..a2fdfe621a
--- /dev/null
+++ b/server/src/org/jsoup/parser/TokenQueue.java
@@ -0,0 +1,393 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.StringUtil;
+import org.jsoup.helper.Validate;
+
+/**
+ * A character queue with parsing helpers.
+ *
+ * @author Jonathan Hedley
+ */
+public class TokenQueue {
+ // Backing string; characters before index pos have already been consumed.
+ private String queue;
+ // Index into queue of the next character to be read.
+ private int pos = 0;
+
+ private static final char ESC = '\\'; // escape char for chomp balanced.
+
+ /**
+  * Builds a token queue that reads from the start of the given text.
+  *
+  * @param data text backing the queue; checked with {@code Validate.notNull} before use.
+  */
+ public TokenQueue(String data) {
+     Validate.notNull(data);
+     this.queue = data;
+ }
+
+ /**
+  * Reports whether the queue has run out of characters.
+  *
+  * @return true when the number of remaining characters is exactly zero.
+  */
+ public boolean isEmpty() {
+     final int left = remainingLength();
+     return left == 0;
+ }
+
+ /**
+  * Counts the characters between the current read position and the end of the backing string.
+  *
+  * @return length of the backing string minus the current position.
+  */
+ private int remainingLength() {
+     final int total = queue.length();
+     return total - pos;
+ }
+
+ /**
+  * Looks at the character at the head of the queue without consuming it.
+  *
+  * @return the next character, or 0 if the queue is empty.
+  */
+ public char peek() {
+     if (isEmpty()) {
+         return 0;
+     }
+     return queue.charAt(pos);
+ }
+
+ /**
+  * Pushes a single character onto the front of the queue, making it the next character retrieved.
+  *
+  * @param c character to add
+  */
+ public void addFirst(Character c) {
+     final String single = c.toString();
+     addFirst(single);
+ }
+
+ /**
+  * Pushes a string onto the front of the queue, ahead of whatever is still unread.
+  *
+  * @param seq string to add.
+  */
+ public void addFirst(String seq) {
+     // Rebuilds the backing string from scratch: not fast, but this is an edge-case path.
+     final String unread = queue.substring(pos);
+     queue = seq + unread;
+     pos = 0;
+ }
+
+ /**
+  * Checks, ignoring case, whether the queue currently starts with the given sequence. Nothing is consumed.
+  *
+  * @param seq String to check queue for.
+  * @return true if the next characters match.
+  */
+ public boolean matches(String seq) {
+     final int len = seq.length();
+     final boolean ignoreCase = true;
+     return queue.regionMatches(ignoreCase, pos, seq, 0, len);
+ }
+
+ /**
+  * Checks, respecting case, whether the queue currently starts with the given sequence. Nothing is consumed.
+  *
+  * @param seq string to case sensitively check for
+  * @return true if matched, false if not
+  */
+ public boolean matchesCS(String seq) {
+     final int len = seq.length();
+     // a case-sensitive region comparison anchored at pos is the same test as startsWith(seq, pos)
+     return queue.regionMatches(pos, seq, 0, len);
+ }
+
+
+ /**
+  * Checks whether the queue currently starts with at least one of the given sequences. Case insensitive.
+  *
+  * @param seq list of strings to case insensitively check for
+  * @return true if any matched, false if none did
+  */
+ public boolean matchesAny(String... seq) {
+     for (int i = 0; i < seq.length; i++) {
+         if (matches(seq[i])) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Checks whether the next character on the queue equals any of the given characters. Case sensitive.
+  *
+  * @param seq characters to compare the head of the queue against
+  * @return true if the queue is not empty and its next character is one of {@code seq}
+  */
+ public boolean matchesAny(char... seq) {
+     if (!isEmpty()) {
+         for (int i = 0; i < seq.length; i++) {
+             if (seq[i] == queue.charAt(pos)) {
+                 return true;
+             }
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Checks whether the queue starts with {@code <} immediately followed by a letter.
+  *
+  * @return true if at least two characters remain and they look like the start of a tag
+  */
+ public boolean matchesStartTag() {
+     // micro opt for matching "<x": compare characters directly
+     if (remainingLength() < 2) {
+         return false;
+     }
+     if (queue.charAt(pos) != '<') {
+         return false;
+     }
+     return Character.isLetter(queue.charAt(pos + 1));
+ }
+
+ /**
+  * Tests whether the queue starts with the sequence (case insensitively, as with matches) and, if it does,
+  * removes the matched characters from the queue.
+  *
+  * @param seq String to search for, and if found, remove from queue.
+  * @return true if found and removed, false if not found.
+  */
+ public boolean matchChomp(String seq) {
+     if (!matches(seq)) {
+         return false;
+     }
+     pos += seq.length();
+     return true;
+ }
+
+ /**
+  * Tests whether the next character on the queue is whitespace, as judged by {@code StringUtil.isWhitespace}.
+  *
+  * @return true if the queue is not empty and starts with whitespace
+  */
+ public boolean matchesWhitespace() {
+     if (isEmpty()) {
+         return false;
+     }
+     return StringUtil.isWhitespace(queue.charAt(pos));
+ }
+
+ /**
+  * Tests whether the next character on the queue is a word character (letter or digit).
+  *
+  * @return true if the queue is not empty and starts with a letter or digit
+  */
+ public boolean matchesWord() {
+     if (isEmpty()) {
+         return false;
+     }
+     final char head = queue.charAt(pos);
+     return Character.isLetterOrDigit(head);
+ }
+
+ /**
+  * Discards the next character on the queue. Does nothing when the queue is already empty.
+  */
+ public void advance() {
+     if (isEmpty()) {
+         return;
+     }
+     pos++;
+ }
+
+ /**
+  * Consume one character off queue.
+  * <p>
+  * The read position only moves after the character has been read successfully, so a failed call on an
+  * exhausted queue leaves the position untouched and {@link #isEmpty()} keeps reporting true.
+  *
+  * @return first character on queue.
+  * @throws IndexOutOfBoundsException if there is no character at the current position (queue is empty).
+  */
+ public char consume() {
+     // Read first, then advance: with charAt(pos++) the increment sticks even when charAt throws,
+     // which would push pos past the end and make remainingLength() negative.
+     final char c = queue.charAt(pos);
+     pos++;
+     return c;
+ }
+
+ /**
+  * Consumes the supplied sequence off the head of the queue. Case insensitive.
+  * <p>
+  * If the queue does not start with the supplied sequence, an illegal state exception is thrown and nothing
+  * is consumed -- but you should be running {@link #matches(String)} against that condition first.
+  *
+  * @param seq sequence to remove from head of queue.
+  * @throws IllegalStateException if the queue does not start with {@code seq}.
+  */
+ public void consume(String seq) {
+     // matches() compares a region of the backing string, and a region comparison fails whenever the
+     // region would run past the end. A successful match therefore already proves that seq.length()
+     // characters remain, so no separate "queue not long enough" check is needed.
+     if (!matches(seq))
+         throw new IllegalStateException("Queue did not match expected sequence");
+
+     pos += seq.length();
+ }
+
+ /**
+  * Consumes and returns everything before the first occurrence of the terminator. The terminator itself stays
+  * on the queue. If the terminator never occurs, the whole remainder of the queue is consumed and returned.
+  *
+  * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b>
+  * @return The matched data consumed from queue.
+  */
+ public String consumeTo(String seq) {
+     final int hit = queue.indexOf(seq, pos);
+     if (hit == -1) {
+         return remainder();
+     }
+     final String head = queue.substring(pos, hit);
+     pos = hit; // same as advancing by head.length()
+     return head;
+ }
+
+ /**
+  * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out.
+  * The terminator is matched case insensitively and is left on the queue.
+  * <p>
+  * An empty terminator matches immediately, so nothing is consumed and an empty string is returned (the same
+  * result {@link #consumeTo(String)} gives for an empty terminator).
+  *
+  * @param seq String to end on (and not include in return, but leave on queue). <b>Case insensitive.</b>
+  * @return The data consumed from queue.
+  */
+ public String consumeToIgnoreCase(String seq) {
+     if (seq.length() == 0)
+         return ""; // seq.substring(0, 1) below would throw on an empty terminator
+
+     int start = pos;
+     String first = seq.substring(0, 1);
+     boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of
+     while (!isEmpty()) {
+         if (matches(seq))
+             break;
+
+         if (canScan) {
+             int skip = queue.indexOf(first, pos) - pos;
+             if (skip == 0) // this char is the skip char, but not match, so force advance of pos
+                 pos++;
+             else if (skip < 0) // no chance of finding, grab to end
+                 pos = queue.length();
+             else
+                 pos += skip;
+         }
+         else
+             pos++;
+     }
+
+     return queue.substring(start, pos);
+ }
+
+ /**
+  * Consumes up to the first of the supplied terminators, or to the end of the queue if none occurs.
+  * The terminator is left on the queue.
+  *
+  * @param seq any number of terminators to consume to. <b>Case insensitive.</b>
+  * @return consumed string
+  */
+ // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this
+ // is a case sensitive time...
+ public String consumeToAny(String... seq) {
+     final int begin = pos;
+     while (true) {
+         if (isEmpty() || matchesAny(seq)) {
+             break;
+         }
+         pos++;
+     }
+     return queue.substring(begin, pos);
+ }
+
+ /**
+  * Consumes and returns everything before the terminator (like consumeTo), then also removes the terminator
+  * from the queue without returning it.
+  * <p>
+  * If the terminator is never found, everything left on the queue is returned and the queue ends up empty.
+  *
+  * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b>
+  * @return Data matched from queue.
+  */
+ public String chompTo(String seq) {
+     final String before = consumeTo(seq);
+     // drop the terminator itself if it is now at the head of the queue
+     if (matches(seq)) {
+         pos += seq.length();
+     }
+     return before;
+ }
+
+ /**
+  * Case insensitive variant of chompTo: consumes and returns everything before the terminator, then removes
+  * the terminator from the queue without returning it.
+  *
+  * @param seq String to match up to, and not include in return, and to pull off queue. Case insensitive.
+  * @return Data matched from queue.
+  */
+ public String chompToIgnoreCase(String seq) {
+     final String before = consumeToIgnoreCase(seq); // case insensitive scan
+     if (matches(seq)) {
+         pos += seq.length();
+     }
+     return before;
+ }
+
+ /**
+  * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return
+  * "one (two) three", and leave " four" on the queue. An opener or closer directly preceded by a backslash is
+  * not counted towards the nesting depth. Those backslashes are kept in the returned string, which suits
+  * regexes (where the escape must be preserved) but not plain text matching; use unescape for that.
+  *
+  * @param open opener
+  * @param close closer
+  * @return data matched from the queue
+  */
+ public String chompBalanced(char open, char close) {
+     final StringBuilder inner = new StringBuilder();
+     int nesting = 0;
+     char previous = 0;
+
+     while (!isEmpty()) {
+         final char current = consume();
+         if (previous != ESC) { // only unescaped openers/closers change the depth
+             if (current == open)
+                 nesting++;
+             else if (current == close)
+                 nesting--;
+         }
+
+         // previous == 0 on the first character, which keeps the outer opener out of the result;
+         // the outer closer is excluded because the depth is back to zero when it is seen
+         if (nesting > 0 && previous != 0)
+             inner.append(current);
+         previous = current;
+
+         if (nesting <= 0)
+             break;
+     }
+     return inner.toString();
+ }
+
+ /**
+  * Unescapes a backslash escaped string: a lone backslash is dropped and the character after it is kept,
+  * while a doubled backslash produces a single backslash.
+  * <p>
+  * Escapes are read as pairs, so a backslash that was itself escaped does not pair up with a backslash
+  * that follows it: {@code \\\(} (escaped backslash, escaped paren) unescapes to {@code \(}.
+  *
+  * @param in backslash escaped string
+  * @return unescaped string
+  */
+ public static String unescape(String in) {
+     StringBuilder out = new StringBuilder();
+     char last = 0;
+     for (char c : in.toCharArray()) {
+         if (c == ESC) {
+             if (last == ESC) {
+                 out.append(c);
+                 c = 0; // this pair is complete; the next backslash must start a new escape
+             }
+         }
+         else
+             out.append(c);
+         last = c;
+     }
+     return out.toString();
+ }
+
+ /**
+  * Pulls the next run of whitespace characters off the queue.
+  *
+  * @return true if at least one whitespace character was consumed
+  */
+ public boolean consumeWhitespace() {
+     final int begin = pos;
+     while (matchesWhitespace()) {
+         pos++;
+     }
+     return pos != begin;
+ }
+
+ /**
+  * Consumes the leading run of word characters (letters or digits) from the queue.
+  *
+  * @return String of word characters from queue, or empty string if none.
+  */
+ public String consumeWord() {
+     final int begin = pos;
+     while (true) {
+         if (!matchesWord()) {
+             break;
+         }
+         pos++;
+     }
+     return queue.substring(begin, pos);
+ }
+
+ /**
+  * Consumes a tag name off the queue: a run of word characters or any of {@code :}, {@code _}, {@code -}.
+  *
+  * @return tag name
+  */
+ public String consumeTagName() {
+     final int begin = pos;
+     while (!isEmpty()) {
+         if (!matchesWord() && !matchesAny(':', '_', '-')) {
+             break;
+         }
+         pos++;
+     }
+     return queue.substring(begin, pos);
+ }
+
+ /**
+  * Consumes a CSS element selector off the queue: a run of word characters or any of {@code |}, {@code _},
+  * {@code -}. Like a tag name, but with | instead of : for namespaces, to not conflict with :pseudo selects.
+  *
+  * @return tag name
+  */
+ public String consumeElementSelector() {
+     final int begin = pos;
+     while (!isEmpty()) {
+         if (!matchesWord() && !matchesAny('|', '_', '-')) {
+             break;
+         }
+         pos++;
+     }
+     return queue.substring(begin, pos);
+ }
+
+ /**
+  * Consumes a CSS identifier (ID or class) off the queue: a run of letters, digits, {@code -} or {@code _}.
+  * http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier
+  *
+  * @return identifier
+  */
+ public String consumeCssIdentifier() {
+     final int begin = pos;
+     while (!isEmpty()) {
+         if (!matchesWord() && !matchesAny('-', '_')) {
+             break;
+         }
+         pos++;
+     }
+     return queue.substring(begin, pos);
+ }
+
+ /**
+  * Consumes an attribute key off the queue: a run of letters, digits, {@code -}, {@code _} or {@code :}.
+  *
+  * @return attribute key
+  */
+ public String consumeAttributeKey() {
+     final int begin = pos;
+     while (!isEmpty()) {
+         if (!matchesWord() && !matchesAny('-', '_', ':')) {
+             break;
+         }
+         pos++;
+     }
+     return queue.substring(begin, pos);
+ }
+
+ /**
+  * Consume and return whatever is left on the queue. The queue is empty afterwards.
+  *
+  * @return remainder of queue.
+  */
+ public String remainder() {
+     // take the tail in one substring call instead of copying it character by character
+     final String rest = queue.substring(pos);
+     pos = queue.length();
+     return rest;
+ }
+
+ /**
+  * Returns the unconsumed part of the queue without consuming it.
+  *
+  * @return the characters from the current position to the end of the backing string.
+  */
+ @Override
+ public String toString() {
+     return queue.substring(pos);
+ }
+}