aboutsummaryrefslogtreecommitdiffstats
path: root/server/src/org/jsoup/parser/Tokeniser.java
diff options
context:
space:
mode:
Diffstat (limited to 'server/src/org/jsoup/parser/Tokeniser.java')
-rw-r--r--server/src/org/jsoup/parser/Tokeniser.java230
1 files changed, 230 insertions, 0 deletions
diff --git a/server/src/org/jsoup/parser/Tokeniser.java b/server/src/org/jsoup/parser/Tokeniser.java
new file mode 100644
index 0000000000..ce6ee690d6
--- /dev/null
+++ b/server/src/org/jsoup/parser/Tokeniser.java
@@ -0,0 +1,230 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.Entities;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Reads the input stream into tokens.
+ */
+class Tokeniser {
+    static final char replacementChar = '\uFFFD'; // replaces null character
+
+    // The input and the error sink are assigned once in the constructor and never rebound.
+    private final CharacterReader reader; // html input
+    private final ParseErrorList errors; // errors found while tokenising
+
+    private TokeniserState state = TokeniserState.Data; // current tokenisation state
+    private Token emitPending; // the token we are about to emit on next read
+    private boolean isEmitPending = false; // true from emit(Token) until read() returns emitPending
+    // The buffer object itself is never replaced; it is only appended to and emptied.
+    private final StringBuilder charBuffer = new StringBuilder(); // buffers characters to output as one token
+    StringBuilder dataBuffer; // buffers data looking for </script>
+
+    Token.Tag tagPending; // tag we are building up
+    Token.Doctype doctypePending; // doctype building up
+    Token.Comment commentPending; // comment building up
+    private Token.StartTag lastStartTag; // the last start tag emitted, to test appropriate end tag
+    private boolean selfClosingFlagAcknowledged = true; // cleared when a self-closing start tag is emitted
+
+    /**
+     * Creates a tokeniser bound to an input and an error sink.
+     *
+     * @param reader the character source that will be tokenised
+     * @param errors the list that errors found while tokenising are added to
+     */
+    Tokeniser(CharacterReader reader, ParseErrorList errors) {
+        this.errors = errors;
+        this.reader = reader;
+    }
+
+    /**
+     * Returns the next token from the input.
+     * <p>
+     * The current state is run repeatedly until some state emits a non-character token. If characters were
+     * buffered along the way they are returned first, as a single character token, and the emitted token
+     * stays pending so that the following call returns it.
+     *
+     * @return a character token holding the buffered characters, or else the pending token
+     */
+    Token read() {
+        // Only test the self-closing flag once the pending token has actually been handed out. If the
+        // previous call returned buffered characters instead, a self-closing start tag may still be waiting
+        // in emitPending; no consumer has seen it yet, so reporting it as unacknowledged would be spurious.
+        if (!isEmitPending && !selfClosingFlagAcknowledged) {
+            error("Self closing flag not acknowledged");
+            selfClosingFlagAcknowledged = true;
+        }
+
+        while (!isEmitPending)
+            state.read(this, reader);
+
+        // an emit is pending, so a non-character token was found: flush buffered chars first, keep the token
+        if (charBuffer.length() > 0) {
+            String str = charBuffer.toString();
+            charBuffer.setLength(0);
+            return new Token.Character(str);
+        }
+
+        isEmitPending = false;
+        return emitPending;
+    }
+
+    /**
+     * Stores a non-character token as the pending token for the next read.
+     * <p>
+     * A start tag is also remembered as the last start tag, and a self-closing one clears the
+     * acknowledged flag. An end tag that carries attributes is reported as an error.
+     *
+     * @param token the token to hand out; validation fails if another token is still pending
+     */
+    void emit(Token token) {
+        Validate.isFalse(isEmitPending, "There is an unread token pending!");
+
+        isEmitPending = true;
+        emitPending = token;
+
+        if (token.type == Token.TokenType.EndTag) {
+            Token.EndTag closing = (Token.EndTag) token;
+            if (closing.attributes.size() > 0)
+                error("Attributes incorrectly present on end tag");
+            return;
+        }
+
+        if (token.type == Token.TokenType.StartTag) {
+            Token.StartTag opening = (Token.StartTag) token;
+            lastStartTag = opening;
+            if (opening.selfClosing)
+                selfClosingFlagAcknowledged = false;
+        }
+    }
+
+    /**
+     * Appends a string to the character buffer.
+     * <p>
+     * Strings are accumulated so that a run of character data (character references etc.) leaves as one
+     * token. This does not mark an emit as pending; read() checks the buffer itself.
+     *
+     * @param str the characters to buffer
+     */
+    void emit(String str) {
+        this.charBuffer.append(str);
+    }
+
+    /**
+     * Appends a single character to the character buffer; it does not mark an emit as pending.
+     *
+     * @param c the character to buffer
+     */
+    void emit(char c) {
+        this.charBuffer.append(c);
+    }
+
+    /**
+     * @return the current tokenisation state
+     */
+    TokeniserState getState() {
+        return this.state;
+    }
+
+    /**
+     * Switches to another tokenisation state without touching the reader.
+     *
+     * @param state the state to use from now on
+     */
+    void transition(TokeniserState state) {
+        this.state = state;
+    }
+
+    /**
+     * Advances the reader, then switches to another tokenisation state.
+     *
+     * @param state the state to use from now on
+     */
+    void advanceTransition(TokeniserState state) {
+        this.reader.advance();
+        this.state = state;
+    }
+
+    /**
+     * Marks the self-closing flag as acknowledged, so that read() does not report it as an error.
+     */
+    void acknowledgeSelfClosingFlag() {
+        this.selfClosingFlagAcknowledged = true;
+    }
+
+ Character consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
+ if (reader.isEmpty())
+ return null;
+ if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
+ return null;
+ if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&'))
+ return null;
+
+ reader.mark();
+ if (reader.matchConsume("#")) { // numbered
+ boolean isHexMode = reader.matchConsumeIgnoreCase("X");
+ String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
+ if (numRef.length() == 0) { // didn't match anything
+ characterReferenceError("numeric reference with no numerals");
+ reader.rewindToMark();
+ return null;
+ }
+ if (!reader.matchConsume(";"))
+ characterReferenceError("missing semicolon"); // missing semi
+ int charval = -1;
+ try {
+ int base = isHexMode ? 16 : 10;
+ charval = Integer.valueOf(numRef, base);
+ } catch (NumberFormatException e) {
+ } // skip
+ if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
+ characterReferenceError("character outside of valid range");
+ return replacementChar;
+ } else {
+ // todo: implement number replacement table
+ // todo: check for extra illegal unicode points as parse errors
+ return (char) charval;
+ }
+ } else { // named
+ // get as many letters as possible, and look for matching entities. unconsume backwards till a match is found
+ String nameRef = reader.consumeLetterThenDigitSequence();
+ String origNameRef = new String(nameRef); // for error reporting. nameRef gets chomped looking for matches
+ boolean looksLegit = reader.matches(';');
+ boolean found = false;
+ while (nameRef.length() > 0 && !found) {
+ if (Entities.isNamedEntity(nameRef))
+ found = true;
+ else {
+ nameRef = nameRef.substring(0, nameRef.length()-1);
+ reader.unconsume();
+ }
+ }
+ if (!found) {
+ if (looksLegit) // named with semicolon
+ characterReferenceError(String.format("invalid named referenece '%s'", origNameRef));
+ reader.rewindToMark();
+ return null;
+ }
+ if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
+ // don't want that to match
+ reader.rewindToMark();
+ return null;
+ }
+ if (!reader.matchConsume(";"))
+ characterReferenceError("missing semicolon"); // missing semi
+ return Entities.getCharacterByName(nameRef);
+ }
+ }
+
+    /**
+     * Replaces the pending tag with a fresh, empty tag token.
+     *
+     * @param start {@code true} to create a start tag, {@code false} to create an end tag
+     * @return the newly created pending tag
+     */
+    Token.Tag createTagPending(boolean start) {
+        if (start)
+            tagPending = new Token.StartTag();
+        else
+            tagPending = new Token.EndTag();
+        return tagPending;
+    }
+
+    /**
+     * Finalises the pending tag and emits it.
+     */
+    void emitTagPending() {
+        Token.Tag tag = tagPending;
+        tag.finaliseTag();
+        emit(tag);
+    }
+
+    /**
+     * Replaces the pending comment with a fresh comment token.
+     */
+    void createCommentPending() {
+        this.commentPending = new Token.Comment();
+    }
+
+    /**
+     * Emits the comment token that is currently being built.
+     */
+    void emitCommentPending() {
+        emit(this.commentPending);
+    }
+
+    /**
+     * Replaces the pending doctype with a fresh doctype token.
+     */
+    void createDoctypePending() {
+        this.doctypePending = new Token.Doctype();
+    }
+
+    /**
+     * Emits the doctype token that is currently being built.
+     */
+    void emitDoctypePending() {
+        emit(this.doctypePending);
+    }
+
+    /**
+     * Replaces the data buffer with a new, empty one.
+     */
+    void createTempBuffer() {
+        this.dataBuffer = new StringBuilder();
+    }
+
+    /**
+     * Tests whether the pending tag has the same name as the last start tag that was emitted.
+     *
+     * @return {@code false} if no start tag has been emitted yet, otherwise whether the two names are equal
+     */
+    boolean isAppropriateEndTagToken() {
+        return lastStartTag != null && tagPending.tagName.equals(lastStartTag.tagName);
+    }
+
+    /**
+     * Returns the tag name of the last start tag that was emitted.
+     *
+     * @return the last start tag's name, or {@code null} if no start tag has been emitted yet
+     */
+    String appropriateEndTagName() {
+        // same guard as isAppropriateEndTagToken(): before the first start tag there is nothing to
+        // dereference, so report "no name" rather than throwing a NullPointerException
+        if (lastStartTag == null)
+            return null;
+        return lastStartTag.tagName;
+    }
+
+    /**
+     * Records an "unexpected character" error at the reader's current position, if the list accepts more errors.
+     *
+     * @param state the state to name in the error message
+     */
+    void error(TokeniserState state) {
+        if (!errors.canAddError())
+            return;
+        errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state));
+    }
+
+    /**
+     * Records an "unexpected end of file" error at the reader's current position, if the list accepts more errors.
+     *
+     * @param state the state to name in the error message
+     */
+    void eofError(TokeniserState state) {
+        if (!errors.canAddError())
+            return;
+        errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state));
+    }
+
+    /**
+     * Records an invalid character reference error at the reader's current position, if the list accepts more errors.
+     *
+     * @param message the detail to insert into the error message
+     */
+    private void characterReferenceError(String message) {
+        if (!errors.canAddError())
+            return;
+        errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message));
+    }
+
+    /**
+     * Records an error with the given message at the reader's current position, if the list accepts more errors.
+     *
+     * @param errorMsg the error message to record
+     */
+    private void error(String errorMsg) {
+        if (!errors.canAddError())
+            return;
+        errors.add(new ParseError(reader.pos(), errorMsg));
+    }
+
+    /**
+     * Placeholder for a namespace check: currently always answers {@code true}.
+     *
+     * @return {@code true}, unconditionally
+     */
+    boolean currentNodeInHtmlNS() {
+        // todo: implement namespaces correctly. Sketch of the intended check:
+        //   Element currentNode = currentNode();
+        //   return currentNode != null && currentNode.namespace().equals("HTML");
+        return true;
+    }
+}