diff options
Diffstat (limited to 'server/src/org/jsoup/parser/TokenQueue.java')
-rw-r--r-- | server/src/org/jsoup/parser/TokenQueue.java | 274 |
1 files changed, 177 insertions, 97 deletions
diff --git a/server/src/org/jsoup/parser/TokenQueue.java b/server/src/org/jsoup/parser/TokenQueue.java index a2fdfe621a..3e7127e640 100644 --- a/server/src/org/jsoup/parser/TokenQueue.java +++ b/server/src/org/jsoup/parser/TokenQueue.java @@ -5,18 +5,20 @@ import org.jsoup.helper.Validate; /** * A character queue with parsing helpers. - * + * * @author Jonathan Hedley */ public class TokenQueue { private String queue; private int pos = 0; - + private static final char ESC = '\\'; // escape char for chomp balanced. /** - Create a new TokenQueue. - @param data string of data to back queue. + * Create a new TokenQueue. + * + * @param data + * string of data to back queue. */ public TokenQueue(String data) { Validate.notNull(data); @@ -25,18 +27,20 @@ public class TokenQueue { /** * Is the queue empty? + * * @return true if no data left in queue. */ public boolean isEmpty() { return remainingLength() == 0; } - + private int remainingLength() { return queue.length() - pos; } /** * Retrieves but does not remove the first character from the queue. + * * @return First character, or 0 if empty. */ public char peek() { @@ -44,16 +48,21 @@ public class TokenQueue { } /** - Add a character to the start of the queue (will be the next character retrieved). - @param c character to add + * Add a character to the start of the queue (will be the next character + * retrieved). + * + * @param c + * character to add */ public void addFirst(Character c) { addFirst(c.toString()); } /** - Add a string to the start of the queue. - @param seq string to add. + * Add a string to the start of the queue. + * + * @param seq + * string to add. */ public void addFirst(String seq) { // not very performant, but an edge case @@ -62,8 +71,11 @@ public class TokenQueue { } /** - * Tests if the next characters on the queue match the sequence. Case insensitive. - * @param seq String to check queue for. + * Tests if the next characters on the queue match the sequence. Case + * insensitive. + * + * @param seq + * String to check queue for. * @return true if the next characters match. */ public boolean matches(String seq) { @@ -72,47 +84,57 @@ public class TokenQueue { /** * Case sensitive match test. - * @param seq string to case sensitively check for + * + * @param seq + * string to case sensitively check for * @return true if matched, false if not */ public boolean matchesCS(String seq) { return queue.startsWith(seq, pos); } - /** - Tests if the next characters match any of the sequences. Case insensitive. - @param seq list of strings to case insensitively check for - @return true of any matched, false if none did + * Tests if the next characters match any of the sequences. Case + * insensitive. + * + * @param seq + * list of strings to case insensitively check for + * @return true of any matched, false if none did */ public boolean matchesAny(String... seq) { for (String s : seq) { - if (matches(s)) + if (matches(s)) { return true; + } } return false; } public boolean matchesAny(char... seq) { - if (isEmpty()) + if (isEmpty()) { return false; + } - for (char c: seq) { - if (queue.charAt(pos) == c) + for (char c : seq) { + if (queue.charAt(pos) == c) { return true; + } } return false; } public boolean matchesStartTag() { // micro opt for matching "<x" - return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character.isLetter(queue.charAt(pos+1))); + return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character + .isLetter(queue.charAt(pos + 1))); } /** - * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the - * queue. - * @param seq String to search for, and if found, remove from queue. + * Tests if the queue matches the sequence (as with match), and if they do, + * removes the matched string from the queue. + * + * @param seq + * String to search for, and if found, remove from queue. * @return true if found and removed, false if not found. */ public boolean matchChomp(String seq) { @@ -125,16 +147,18 @@ public class TokenQueue { } /** - Tests if queue starts with a whitespace character. - @return if starts with whitespace + * Tests if queue starts with a whitespace character. + * + * @return if starts with whitespace */ public boolean matchesWhitespace() { return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos)); } /** - Test if the queue matches a word character (letter or digit). - @return if matches a word character + * Test if the queue matches a word character (letter or digit). + * + * @return if matches a word character */ public boolean matchesWord() { return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos)); @@ -144,11 +168,14 @@ public class TokenQueue { * Drops the next character off the queue. */ public void advance() { - if (!isEmpty()) pos++; + if (!isEmpty()) { + pos++; + } } /** * Consume one character off queue. + * * @return first character on queue. */ public char consume() { @@ -156,25 +183,36 @@ public class TokenQueue { } /** - * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will - * throw an illegal state exception -- but you should be running match() against that condition. - <p> - Case insensitive. - * @param seq sequence to remove from head of queue. + * Consumes the supplied sequence of the queue. If the queue does not start + * with the supplied sequence, will throw an illegal state exception -- but + * you should be running match() against that condition. + * <p> + * Case insensitive. + * + * @param seq + * sequence to remove from head of queue. */ public void consume(String seq) { - if (!matches(seq)) - throw new IllegalStateException("Queue did not match expected sequence"); + if (!matches(seq)) { + throw new IllegalStateException( + "Queue did not match expected sequence"); + } int len = seq.length(); - if (len > remainingLength()) - throw new IllegalStateException("Queue not long enough to consume sequence"); - + if (len > remainingLength()) { + throw new IllegalStateException( + "Queue not long enough to consume sequence"); + } + pos += len; } /** - * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out. - * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b> + * Pulls a string off the queue, up to but exclusive of the match sequence, + * or to the queue running out. + * + * @param seq + * String to end on (and not include in return, but leave on + * queue). <b>Case sensitive.</b> * @return The matched data consumed from queue. */ public String consumeTo(String seq) { @@ -187,38 +225,52 @@ public class TokenQueue { return remainder(); } } - + public String consumeToIgnoreCase(String seq) { int start = pos; String first = seq.substring(0, 1); - boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of + boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if + // first + // is + // not + // cased, + // use + // index + // of while (!isEmpty()) { - if (matches(seq)) + if (matches(seq)) { break; - + } + if (canScan) { int skip = queue.indexOf(first, pos) - pos; - if (skip == 0) // this char is the skip char, but not match, so force advance of pos + if (skip == 0) { pos++; - else if (skip < 0) // no chance of finding, grab to end + } else if (skip < 0) { pos = queue.length(); - else + } else { pos += skip; - } - else + } + } else { pos++; + } } - String data = queue.substring(start, pos); - return data; + String data = queue.substring(start, pos); + return data; } /** - Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue. - @param seq any number of terminators to consume to. <b>Case insensitive.</b> - @return consumed string + * Consumes to the first sequence provided, or to the end of the queue. + * Leaves the terminator on the queue. + * + * @param seq + * any number of terminators to consume to. <b>Case + * insensitive.</b> + * @return consumed string */ - // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this + // todo: method name. not good that consumeTo cares for case, and consume to + // any doesn't. And the only use for this // is is a case sensitive time... public String consumeToAny(String... seq) { int start = pos; @@ -226,16 +278,20 @@ public class TokenQueue { pos++; } - String data = queue.substring(start, pos); - return data; + String data = queue.substring(start, pos); + return data; } /** - * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it). + * Pulls a string off the queue (like consumeTo), and then pulls off the + * matched string (but does not return it). * <p> - * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go - * isEmpty() == true). - * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b> + * If the queue runs out of characters before finding the seq, will return + * as much as it can (and queue will go isEmpty() == true). + * + * @param seq + * String to match up to, and not include in return, and to pull + * off queue. <b>Case sensitive.</b> * @return Data matched from queue. */ public String chompTo(String seq) { @@ -243,7 +299,7 @@ public class TokenQueue { matchChomp(seq); return data; } - + public String chompToIgnoreCase(String seq) { String data = consumeToIgnoreCase(seq); // case insensitive scan matchChomp(seq); @@ -251,12 +307,17 @@ public class TokenQueue { } /** - * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three", - * and leave " four" on the queue. Unbalanced openers and closers can be escaped (with \). Those escapes will be left - * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for + * Pulls a balanced string off the queue. E.g. if queue is + * "(one (two) three) four", (,) will return "one (two) three", and leave + * " four" on the queue. Unbalanced openers and closers can be escaped (with + * \). Those escapes will be left in the returned string, which is suitable + * for regexes (where we need to preserve the escape), but unsuitable for * contains text strings; use unescape for that. - * @param open opener - * @param close closer + * + * @param open + * opener + * @param close + * closer * @return data matched from the queue */ public String chompBalanced(char open, char close) { @@ -265,25 +326,32 @@ public class TokenQueue { char last = 0; do { - if (isEmpty()) break; + if (isEmpty()) { + break; + } Character c = consume(); if (last == 0 || last != ESC) { - if (c.equals(open)) + if (c.equals(open)) { depth++; - else if (c.equals(close)) + } else if (c.equals(close)) { depth--; + } } - if (depth > 0 && last != 0) - accum.append(c); // don't include the outer match pair in the return + if (depth > 0 && last != 0) { + accum.append(c); // don't include the outer match pair in the + // return + } last = c; } while (depth > 0); return accum.toString(); } - + /** * Unescaped a \ escaped string. - * @param in backslash escaped string + * + * @param in + * backslash escaped string * @return unescaped string */ public static String unescape(String in) { @@ -291,11 +359,12 @@ public class TokenQueue { char last = 0; for (char c : in.toCharArray()) { if (c == ESC) { - if (last != 0 && last == ESC) + if (last != 0 && last == ESC) { out.append(c); - } - else + } + } else { out.append(c); + } last = c; } return out.toString(); @@ -315,15 +384,17 @@ public class TokenQueue { /** * Retrieves the next run of word type (letter or digit) off the queue. + * * @return String of word characters from queue, or empty string if none. */ public String consumeWord() { int start = pos; - while (matchesWord()) + while (matchesWord()) { pos++; + } return queue.substring(start, pos); } - + /** * Consume an tag name off the queue (word or :, _, -) * @@ -331,53 +402,61 @@ public class TokenQueue { */ public String consumeTagName() { int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) + while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) { pos++; - + } + return queue.substring(start, pos); } - + /** - * Consume a CSS element selector (tag name, but | instead of : for namespaces, to not conflict with :pseudo selects). + * Consume a CSS element selector (tag name, but | instead of : for + * namespaces, to not conflict with :pseudo selects). * * @return tag name */ public String consumeElementSelector() { int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-'))) + while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-'))) { pos++; - + } + return queue.substring(start, pos); } /** - Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _) - http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier - @return identifier + * Consume a CSS identifier (ID or class) off the queue (letter, digit, -, + * _) http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier + * + * @return identifier */ public String consumeCssIdentifier() { int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) + while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) { pos++; + } return queue.substring(start, pos); } /** - Consume an attribute key off the queue (letter, digit, -, _, :") - @return attribute key + * Consume an attribute key off the queue (letter, digit, -, _, :") + * + * @return attribute key */ public String consumeAttributeKey() { int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) + while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) { pos++; - + } + return queue.substring(start, pos); } /** - Consume and return whatever is left on the queue. - @return remained of queue. + * Consume and return whatever is left on the queue. + * + * @return remained of queue. */ public String remainder() { StringBuilder accum = new StringBuilder(); @@ -386,7 +465,8 @@ public class TokenQueue { } return accum.toString(); } - + + @Override public String toString() { return queue.substring(pos); } |