diff options
author | Artur Signell <artur@vaadin.com> | 2012-08-13 18:34:33 +0300 |
---|---|---|
committer | Artur Signell <artur@vaadin.com> | 2012-08-13 19:18:33 +0300 |
commit | e85d933b25cc3c5cc85eb7eb4b13b950fd8e1569 (patch) | |
tree | 9ab6f13f7188cab44bbd979b1cf620f15328a03f /src/org/jsoup/parser/Tokeniser.java | |
parent | 14dd4d0b28c76eb994b181a4570f3adec53342e6 (diff) | |
download | vaadin-framework-e85d933b25cc3c5cc85eb7eb4b13b950fd8e1569.tar.gz vaadin-framework-e85d933b25cc3c5cc85eb7eb4b13b950fd8e1569.zip |
Moved server files to a server src folder (#9299)
Diffstat (limited to 'src/org/jsoup/parser/Tokeniser.java')
-rw-r--r-- | src/org/jsoup/parser/Tokeniser.java | 230 |
1 files changed, 0 insertions, 230 deletions
diff --git a/src/org/jsoup/parser/Tokeniser.java b/src/org/jsoup/parser/Tokeniser.java deleted file mode 100644 index ce6ee690d6..0000000000 --- a/src/org/jsoup/parser/Tokeniser.java +++ /dev/null @@ -1,230 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Entities; - -import java.util.ArrayList; -import java.util.List; - -/** - * Readers the input stream into tokens. - */ -class Tokeniser { - static final char replacementChar = '\uFFFD'; // replaces null character - - private CharacterReader reader; // html input - private ParseErrorList errors; // errors found while tokenising - - private TokeniserState state = TokeniserState.Data; // current tokenisation state - private Token emitPending; // the token we are about to emit on next read - private boolean isEmitPending = false; - private StringBuilder charBuffer = new StringBuilder(); // buffers characters to output as one token - StringBuilder dataBuffer; // buffers data looking for </script> - - Token.Tag tagPending; // tag we are building up - Token.Doctype doctypePending; // doctype building up - Token.Comment commentPending; // comment building up - private Token.StartTag lastStartTag; // the last start tag emitted, to test appropriate end tag - private boolean selfClosingFlagAcknowledged = true; - - Tokeniser(CharacterReader reader, ParseErrorList errors) { - this.reader = reader; - this.errors = errors; - } - - Token read() { - if (!selfClosingFlagAcknowledged) { - error("Self closing flag not acknowledged"); - selfClosingFlagAcknowledged = true; - } - - while (!isEmitPending) - state.read(this, reader); - - // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read: - if (charBuffer.length() > 0) { - String str = charBuffer.toString(); - charBuffer.delete(0, charBuffer.length()); - return new Token.Character(str); - } else { - isEmitPending = false; - return emitPending; - } - } - - void emit(Token token) { - Validate.isFalse(isEmitPending, "There is an unread token pending!"); - - emitPending = token; - isEmitPending = true; - - if (token.type == Token.TokenType.StartTag) { - Token.StartTag startTag = (Token.StartTag) token; - lastStartTag = startTag; - if (startTag.selfClosing) - selfClosingFlagAcknowledged = false; - } else if (token.type == Token.TokenType.EndTag) { - Token.EndTag endTag = (Token.EndTag) token; - if (endTag.attributes.size() > 0) - error("Attributes incorrectly present on end tag"); - } - } - - void emit(String str) { - // buffer strings up until last string token found, to emit only one token for a run of character refs etc. - // does not set isEmitPending; read checks that - charBuffer.append(str); - } - - void emit(char c) { - charBuffer.append(c); - } - - TokeniserState getState() { - return state; - } - - void transition(TokeniserState state) { - this.state = state; - } - - void advanceTransition(TokeniserState state) { - reader.advance(); - this.state = state; - } - - void acknowledgeSelfClosingFlag() { - selfClosingFlagAcknowledged = true; - } - - Character consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { - if (reader.isEmpty()) - return null; - if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) - return null; - if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&')) - return null; - - reader.mark(); - if (reader.matchConsume("#")) { // numbered - boolean isHexMode = reader.matchConsumeIgnoreCase("X"); - String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); - if (numRef.length() == 0) { // didn't match anything - characterReferenceError("numeric reference with no numerals"); - reader.rewindToMark(); - return null; - } - if (!reader.matchConsume(";")) - characterReferenceError("missing semicolon"); // missing semi - int charval = -1; - try { - int base = isHexMode ? 16 : 10; - charval = Integer.valueOf(numRef, base); - } catch (NumberFormatException e) { - } // skip - if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { - characterReferenceError("character outside of valid range"); - return replacementChar; - } else { - // todo: implement number replacement table - // todo: check for extra illegal unicode points as parse errors - return (char) charval; - } - } else { // named - // get as many letters as possible, and look for matching entities. unconsume backwards till a match is found - String nameRef = reader.consumeLetterThenDigitSequence(); - String origNameRef = new String(nameRef); // for error reporting. nameRef gets chomped looking for matches - boolean looksLegit = reader.matches(';'); - boolean found = false; - while (nameRef.length() > 0 && !found) { - if (Entities.isNamedEntity(nameRef)) - found = true; - else { - nameRef = nameRef.substring(0, nameRef.length()-1); - reader.unconsume(); - } - } - if (!found) { - if (looksLegit) // named with semicolon - characterReferenceError(String.format("invalid named referenece '%s'", origNameRef)); - reader.rewindToMark(); - return null; - } - if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { - // don't want that to match - reader.rewindToMark(); - return null; - } - if (!reader.matchConsume(";")) - characterReferenceError("missing semicolon"); // missing semi - return Entities.getCharacterByName(nameRef); - } - } - - Token.Tag createTagPending(boolean start) { - tagPending = start ? new Token.StartTag() : new Token.EndTag(); - return tagPending; - } - - void emitTagPending() { - tagPending.finaliseTag(); - emit(tagPending); - } - - void createCommentPending() { - commentPending = new Token.Comment(); - } - - void emitCommentPending() { - emit(commentPending); - } - - void createDoctypePending() { - doctypePending = new Token.Doctype(); - } - - void emitDoctypePending() { - emit(doctypePending); - } - - void createTempBuffer() { - dataBuffer = new StringBuilder(); - } - - boolean isAppropriateEndTagToken() { - if (lastStartTag == null) - return false; - return tagPending.tagName.equals(lastStartTag.tagName); - } - - String appropriateEndTagName() { - return lastStartTag.tagName; - } - - void error(TokeniserState state) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state)); - } - - void eofError(TokeniserState state) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state)); - } - - private void characterReferenceError(String message) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message)); - } - - private void error(String errorMsg) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), errorMsg)); - } - - boolean currentNodeInHtmlNS() { - // todo: implement namespaces correctly - return true; - // Element currentNode = currentNode(); - // return currentNode != null && currentNode.namespace().equals("HTML"); - } -} |