aboutsummaryrefslogtreecommitdiffstats
path: root/src/org/jsoup/parser/Tokeniser.java
diff options
context:
space:
mode:
authorArtur Signell <artur@vaadin.com>2012-08-13 18:34:33 +0300
committerArtur Signell <artur@vaadin.com>2012-08-13 19:18:33 +0300
commite85d933b25cc3c5cc85eb7eb4b13b950fd8e1569 (patch)
tree9ab6f13f7188cab44bbd979b1cf620f15328a03f /src/org/jsoup/parser/Tokeniser.java
parent14dd4d0b28c76eb994b181a4570f3adec53342e6 (diff)
downloadvaadin-framework-e85d933b25cc3c5cc85eb7eb4b13b950fd8e1569.tar.gz
vaadin-framework-e85d933b25cc3c5cc85eb7eb4b13b950fd8e1569.zip
Moved server files to a server src folder (#9299)
Diffstat (limited to 'src/org/jsoup/parser/Tokeniser.java')
-rw-r--r--src/org/jsoup/parser/Tokeniser.java230
1 files changed, 0 insertions, 230 deletions
diff --git a/src/org/jsoup/parser/Tokeniser.java b/src/org/jsoup/parser/Tokeniser.java
deleted file mode 100644
index ce6ee690d6..0000000000
--- a/src/org/jsoup/parser/Tokeniser.java
+++ /dev/null
@@ -1,230 +0,0 @@
-package org.jsoup.parser;
-
-import org.jsoup.helper.Validate;
-import org.jsoup.nodes.Entities;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Readers the input stream into tokens.
- */
-class Tokeniser {
- static final char replacementChar = '\uFFFD'; // replaces null character
-
- private CharacterReader reader; // html input
- private ParseErrorList errors; // errors found while tokenising
-
- private TokeniserState state = TokeniserState.Data; // current tokenisation state
- private Token emitPending; // the token we are about to emit on next read
- private boolean isEmitPending = false;
- private StringBuilder charBuffer = new StringBuilder(); // buffers characters to output as one token
- StringBuilder dataBuffer; // buffers data looking for </script>
-
- Token.Tag tagPending; // tag we are building up
- Token.Doctype doctypePending; // doctype building up
- Token.Comment commentPending; // comment building up
- private Token.StartTag lastStartTag; // the last start tag emitted, to test appropriate end tag
- private boolean selfClosingFlagAcknowledged = true;
-
- Tokeniser(CharacterReader reader, ParseErrorList errors) {
- this.reader = reader;
- this.errors = errors;
- }
-
- Token read() {
- if (!selfClosingFlagAcknowledged) {
- error("Self closing flag not acknowledged");
- selfClosingFlagAcknowledged = true;
- }
-
- while (!isEmitPending)
- state.read(this, reader);
-
- // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read:
- if (charBuffer.length() > 0) {
- String str = charBuffer.toString();
- charBuffer.delete(0, charBuffer.length());
- return new Token.Character(str);
- } else {
- isEmitPending = false;
- return emitPending;
- }
- }
-
- void emit(Token token) {
- Validate.isFalse(isEmitPending, "There is an unread token pending!");
-
- emitPending = token;
- isEmitPending = true;
-
- if (token.type == Token.TokenType.StartTag) {
- Token.StartTag startTag = (Token.StartTag) token;
- lastStartTag = startTag;
- if (startTag.selfClosing)
- selfClosingFlagAcknowledged = false;
- } else if (token.type == Token.TokenType.EndTag) {
- Token.EndTag endTag = (Token.EndTag) token;
- if (endTag.attributes.size() > 0)
- error("Attributes incorrectly present on end tag");
- }
- }
-
- void emit(String str) {
- // buffer strings up until last string token found, to emit only one token for a run of character refs etc.
- // does not set isEmitPending; read checks that
- charBuffer.append(str);
- }
-
- void emit(char c) {
- charBuffer.append(c);
- }
-
- TokeniserState getState() {
- return state;
- }
-
- void transition(TokeniserState state) {
- this.state = state;
- }
-
- void advanceTransition(TokeniserState state) {
- reader.advance();
- this.state = state;
- }
-
- void acknowledgeSelfClosingFlag() {
- selfClosingFlagAcknowledged = true;
- }
-
- Character consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
- if (reader.isEmpty())
- return null;
- if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
- return null;
- if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&'))
- return null;
-
- reader.mark();
- if (reader.matchConsume("#")) { // numbered
- boolean isHexMode = reader.matchConsumeIgnoreCase("X");
- String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
- if (numRef.length() == 0) { // didn't match anything
- characterReferenceError("numeric reference with no numerals");
- reader.rewindToMark();
- return null;
- }
- if (!reader.matchConsume(";"))
- characterReferenceError("missing semicolon"); // missing semi
- int charval = -1;
- try {
- int base = isHexMode ? 16 : 10;
- charval = Integer.valueOf(numRef, base);
- } catch (NumberFormatException e) {
- } // skip
- if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
- characterReferenceError("character outside of valid range");
- return replacementChar;
- } else {
- // todo: implement number replacement table
- // todo: check for extra illegal unicode points as parse errors
- return (char) charval;
- }
- } else { // named
- // get as many letters as possible, and look for matching entities. unconsume backwards till a match is found
- String nameRef = reader.consumeLetterThenDigitSequence();
- String origNameRef = new String(nameRef); // for error reporting. nameRef gets chomped looking for matches
- boolean looksLegit = reader.matches(';');
- boolean found = false;
- while (nameRef.length() > 0 && !found) {
- if (Entities.isNamedEntity(nameRef))
- found = true;
- else {
- nameRef = nameRef.substring(0, nameRef.length()-1);
- reader.unconsume();
- }
- }
- if (!found) {
- if (looksLegit) // named with semicolon
- characterReferenceError(String.format("invalid named referenece '%s'", origNameRef));
- reader.rewindToMark();
- return null;
- }
- if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
- // don't want that to match
- reader.rewindToMark();
- return null;
- }
- if (!reader.matchConsume(";"))
- characterReferenceError("missing semicolon"); // missing semi
- return Entities.getCharacterByName(nameRef);
- }
- }
-
- Token.Tag createTagPending(boolean start) {
- tagPending = start ? new Token.StartTag() : new Token.EndTag();
- return tagPending;
- }
-
- void emitTagPending() {
- tagPending.finaliseTag();
- emit(tagPending);
- }
-
- void createCommentPending() {
- commentPending = new Token.Comment();
- }
-
- void emitCommentPending() {
- emit(commentPending);
- }
-
- void createDoctypePending() {
- doctypePending = new Token.Doctype();
- }
-
- void emitDoctypePending() {
- emit(doctypePending);
- }
-
- void createTempBuffer() {
- dataBuffer = new StringBuilder();
- }
-
- boolean isAppropriateEndTagToken() {
- if (lastStartTag == null)
- return false;
- return tagPending.tagName.equals(lastStartTag.tagName);
- }
-
- String appropriateEndTagName() {
- return lastStartTag.tagName;
- }
-
- void error(TokeniserState state) {
- if (errors.canAddError())
- errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state));
- }
-
- void eofError(TokeniserState state) {
- if (errors.canAddError())
- errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state));
- }
-
- private void characterReferenceError(String message) {
- if (errors.canAddError())
- errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message));
- }
-
- private void error(String errorMsg) {
- if (errors.canAddError())
- errors.add(new ParseError(reader.pos(), errorMsg));
- }
-
- boolean currentNodeInHtmlNS() {
- // todo: implement namespaces correctly
- return true;
- // Element currentNode = currentNode();
- // return currentNode != null && currentNode.namespace().equals("HTML");
- }
-}