diff options
Diffstat (limited to 'server/src/org/jsoup/parser/Tokeniser.java')
-rw-r--r-- | server/src/org/jsoup/parser/Tokeniser.java | 264 |
1 files changed, 0 insertions, 264 deletions
diff --git a/server/src/org/jsoup/parser/Tokeniser.java b/server/src/org/jsoup/parser/Tokeniser.java deleted file mode 100644 index f46c962281..0000000000 --- a/server/src/org/jsoup/parser/Tokeniser.java +++ /dev/null @@ -1,264 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Entities; - -/** - * Readers the input stream into tokens. - */ -class Tokeniser { - static final char replacementChar = '\uFFFD'; // replaces null character - - private CharacterReader reader; // html input - private ParseErrorList errors; // errors found while tokenising - - private TokeniserState state = TokeniserState.Data; // current tokenisation - // state - private Token emitPending; // the token we are about to emit on next read - private boolean isEmitPending = false; - private StringBuilder charBuffer = new StringBuilder(); // buffers - // characters to - // output as one - // token - StringBuilder dataBuffer; // buffers data looking for </script> - - Token.Tag tagPending; // tag we are building up - Token.Doctype doctypePending; // doctype building up - Token.Comment commentPending; // comment building up - private Token.StartTag lastStartTag; // the last start tag emitted, to test - // appropriate end tag - private boolean selfClosingFlagAcknowledged = true; - - Tokeniser(CharacterReader reader, ParseErrorList errors) { - this.reader = reader; - this.errors = errors; - } - - Token read() { - if (!selfClosingFlagAcknowledged) { - error("Self closing flag not acknowledged"); - selfClosingFlagAcknowledged = true; - } - - while (!isEmitPending) { - state.read(this, reader); - } - - // if emit is pending, a non-character token was found: return any chars - // in buffer, and leave token for next read: - if (charBuffer.length() > 0) { - String str = charBuffer.toString(); - charBuffer.delete(0, charBuffer.length()); - return new Token.Character(str); - } else { - isEmitPending = false; - return emitPending; - } - } - - void emit(Token token) { - Validate.isFalse(isEmitPending, "There is an unread token pending!"); - - emitPending = token; - isEmitPending = true; - - if (token.type == Token.TokenType.StartTag) { - Token.StartTag startTag = (Token.StartTag) token; - lastStartTag = startTag; - if (startTag.selfClosing) { - selfClosingFlagAcknowledged = false; - } - } else if (token.type == Token.TokenType.EndTag) { - Token.EndTag endTag = (Token.EndTag) token; - if (endTag.attributes.size() > 0) { - error("Attributes incorrectly present on end tag"); - } - } - } - - void emit(String str) { - // buffer strings up until last string token found, to emit only one - // token for a run of character refs etc. - // does not set isEmitPending; read checks that - charBuffer.append(str); - } - - void emit(char c) { - charBuffer.append(c); - } - - TokeniserState getState() { - return state; - } - - void transition(TokeniserState state) { - this.state = state; - } - - void advanceTransition(TokeniserState state) { - reader.advance(); - this.state = state; - } - - void acknowledgeSelfClosingFlag() { - selfClosingFlagAcknowledged = true; - } - - Character consumeCharacterReference(Character additionalAllowedCharacter, - boolean inAttribute) { - if (reader.isEmpty()) { - return null; - } - if (additionalAllowedCharacter != null - && additionalAllowedCharacter == reader.current()) { - return null; - } - if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&')) { - return null; - } - - reader.mark(); - if (reader.matchConsume("#")) { // numbered - boolean isHexMode = reader.matchConsumeIgnoreCase("X"); - String numRef = isHexMode ? reader.consumeHexSequence() : reader - .consumeDigitSequence(); - if (numRef.length() == 0) { // didn't match anything - characterReferenceError("numeric reference with no numerals"); - reader.rewindToMark(); - return null; - } - if (!reader.matchConsume(";")) { - characterReferenceError("missing semicolon"); // missing semi - } - int charval = -1; - try { - int base = isHexMode ? 16 : 10; - charval = Integer.valueOf(numRef, base); - } catch (NumberFormatException e) { - } // skip - if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) - || charval > 0x10FFFF) { - characterReferenceError("character outside of valid range"); - return replacementChar; - } else { - // todo: implement number replacement table - // todo: check for extra illegal unicode points as parse errors - return (char) charval; - } - } else { // named - // get as many letters as possible, and look for matching entities. - // unconsume backwards till a match is found - String nameRef = reader.consumeLetterThenDigitSequence(); - String origNameRef = new String(nameRef); // for error reporting. - // nameRef gets chomped - // looking for matches - boolean looksLegit = reader.matches(';'); - boolean found = false; - while (nameRef.length() > 0 && !found) { - if (Entities.isNamedEntity(nameRef)) { - found = true; - } else { - nameRef = nameRef.substring(0, nameRef.length() - 1); - reader.unconsume(); - } - } - if (!found) { - if (looksLegit) { - characterReferenceError(String.format( - "invalid named referenece '%s'", origNameRef)); - } - reader.rewindToMark(); - return null; - } - if (inAttribute - && (reader.matchesLetter() || reader.matchesDigit() || reader - .matchesAny('=', '-', '_'))) { - // don't want that to match - reader.rewindToMark(); - return null; - } - if (!reader.matchConsume(";")) { - characterReferenceError("missing semicolon"); // missing semi - } - return Entities.getCharacterByName(nameRef); - } - } - - Token.Tag createTagPending(boolean start) { - tagPending = start ? new Token.StartTag() : new Token.EndTag(); - return tagPending; - } - - void emitTagPending() { - tagPending.finaliseTag(); - emit(tagPending); - } - - void createCommentPending() { - commentPending = new Token.Comment(); - } - - void emitCommentPending() { - emit(commentPending); - } - - void createDoctypePending() { - doctypePending = new Token.Doctype(); - } - - void emitDoctypePending() { - emit(doctypePending); - } - - void createTempBuffer() { - dataBuffer = new StringBuilder(); - } - - boolean isAppropriateEndTagToken() { - if (lastStartTag == null) { - return false; - } - return tagPending.tagName.equals(lastStartTag.tagName); - } - - String appropriateEndTagName() { - return lastStartTag.tagName; - } - - void error(TokeniserState state) { - if (errors.canAddError()) { - errors.add(new ParseError(reader.pos(), - "Unexpected character '%s' in input state [%s]", reader - .current(), state)); - } - } - - void eofError(TokeniserState state) { - if (errors.canAddError()) { - errors.add(new ParseError( - reader.pos(), - "Unexpectedly reached end of file (EOF) in input state [%s]", - state)); - } - } - - private void characterReferenceError(String message) { - if (errors.canAddError()) { - errors.add(new ParseError(reader.pos(), - "Invalid character reference: %s", message)); - } - } - - private void error(String errorMsg) { - if (errors.canAddError()) { - errors.add(new ParseError(reader.pos(), errorMsg)); - } - } - - boolean currentNodeInHtmlNS() { - // todo: implement namespaces correctly - return true; - // Element currentNode = currentNode(); - // return currentNode != null && currentNode.namespace().equals("HTML"); - } -} |