summaryrefslogtreecommitdiffstats
path: root/src/org/jsoup/parser/TokeniserState.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/org/jsoup/parser/TokeniserState.java')
-rw-r--r--src/org/jsoup/parser/TokeniserState.java1778
1 files changed, 1778 insertions, 0 deletions
diff --git a/src/org/jsoup/parser/TokeniserState.java b/src/org/jsoup/parser/TokeniserState.java
new file mode 100644
index 0000000000..e3013c73e9
--- /dev/null
+++ b/src/org/jsoup/parser/TokeniserState.java
@@ -0,0 +1,1778 @@
+package org.jsoup.parser;
+
+/**
+ * States and transition activations for the Tokeniser.
+ */
+enum TokeniserState {
+ Data {
+ // in data state, gather characters until a character reference or tag is found
+ void read(Tokeniser t, CharacterReader r) {
+ switch (r.current()) {
+ case '&':
+ t.advanceTransition(CharacterReferenceInData);
+ break;
+ case '<':
+ t.advanceTransition(TagOpen);
+ break;
+ case nullChar:
+ t.error(this); // NOT replacement character (oddly?)
+ t.emit(r.consume());
+ break;
+ case eof:
+ t.emit(new Token.EOF());
+ break;
+ default:
+ String data = r.consumeToAny('&', '<', nullChar);
+ t.emit(data);
+ break;
+ }
+ }
+ },
+ CharacterReferenceInData {
+ // from & in data
+ void read(Tokeniser t, CharacterReader r) {
+ Character c = t.consumeCharacterReference(null, false);
+ if (c == null)
+ t.emit('&');
+ else
+ t.emit(c);
+ t.transition(Data);
+ }
+ },
+ Rcdata {
+ /// handles data in title, textarea etc
+ void read(Tokeniser t, CharacterReader r) {
+ switch (r.current()) {
+ case '&':
+ t.advanceTransition(CharacterReferenceInRcdata);
+ break;
+ case '<':
+ t.advanceTransition(RcdataLessthanSign);
+ break;
+ case nullChar:
+ t.error(this);
+ r.advance();
+ t.emit(replacementChar);
+ break;
+ case eof:
+ t.emit(new Token.EOF());
+ break;
+ default:
+ String data = r.consumeToAny('&', '<', nullChar);
+ t.emit(data);
+ break;
+ }
+ }
+ },
+ CharacterReferenceInRcdata {
+ void read(Tokeniser t, CharacterReader r) {
+ Character c = t.consumeCharacterReference(null, false);
+ if (c == null)
+ t.emit('&');
+ else
+ t.emit(c);
+ t.transition(Rcdata);
+ }
+ },
+ Rawtext {
+ void read(Tokeniser t, CharacterReader r) {
+ switch (r.current()) {
+ case '<':
+ t.advanceTransition(RawtextLessthanSign);
+ break;
+ case nullChar:
+ t.error(this);
+ r.advance();
+ t.emit(replacementChar);
+ break;
+ case eof:
+ t.emit(new Token.EOF());
+ break;
+ default:
+ String data = r.consumeToAny('<', nullChar);
+ t.emit(data);
+ break;
+ }
+ }
+ },
+ ScriptData {
+ void read(Tokeniser t, CharacterReader r) {
+ switch (r.current()) {
+ case '<':
+ t.advanceTransition(ScriptDataLessthanSign);
+ break;
+ case nullChar:
+ t.error(this);
+ r.advance();
+ t.emit(replacementChar);
+ break;
+ case eof:
+ t.emit(new Token.EOF());
+ break;
+ default:
+ String data = r.consumeToAny('<', nullChar);
+ t.emit(data);
+ break;
+ }
+ }
+ },
+ PLAINTEXT {
+ void read(Tokeniser t, CharacterReader r) {
+ switch (r.current()) {
+ case nullChar:
+ t.error(this);
+ r.advance();
+ t.emit(replacementChar);
+ break;
+ case eof:
+ t.emit(new Token.EOF());
+ break;
+ default:
+ String data = r.consumeTo(nullChar);
+ t.emit(data);
+ break;
+ }
+ }
+ },
+ TagOpen {
+ // from < in data
+ void read(Tokeniser t, CharacterReader r) {
+ switch (r.current()) {
+ case '!':
+ t.advanceTransition(MarkupDeclarationOpen);
+ break;
+ case '/':
+ t.advanceTransition(EndTagOpen);
+ break;
+ case '?':
+ t.advanceTransition(BogusComment);
+ break;
+ default:
+ if (r.matchesLetter()) {
+ t.createTagPending(true);
+ t.transition(TagName);
+ } else {
+ t.error(this);
+ t.emit('<'); // char that got us here
+ t.transition(Data);
+ }
+ break;
+ }
+ }
+ },
+ EndTagOpen {
+ void read(Tokeniser t, CharacterReader r) {
+ if (r.isEmpty()) {
+ t.eofError(this);
+ t.emit("</");
+ t.transition(Data);
+ } else if (r.matchesLetter()) {
+ t.createTagPending(false);
+ t.transition(TagName);
+ } else if (r.matches('>')) {
+ t.error(this);
+ t.advanceTransition(Data);
+ } else {
+ t.error(this);
+ t.advanceTransition(BogusComment);
+ }
+ }
+ },
+ TagName {
+     // from < or </ in data, will have start or end tag pending
+     void read(Tokeniser t, CharacterReader r) {
+         // previous TagOpen state did NOT consume, will have a letter char in current.
+         // Lower-case with a fixed locale: the no-arg toLowerCase() follows the JVM default
+         // locale, so under e.g. a Turkish locale "TITLE" would become "t\u0131tle".
+         String tagName = r.consumeToAny('\t', '\n', '\f', ' ', '/', '>', nullChar).toLowerCase(java.util.Locale.ENGLISH);
+         t.tagPending.appendTagName(tagName);
+
+         // the character that stopped the name decides where to go next
+         switch (r.consume()) {
+             case '\t':
+             case '\n':
+             case '\f':
+             case ' ':
+                 t.transition(BeforeAttributeName);
+                 break;
+             case '/':
+                 t.transition(SelfClosingStartTag);
+                 break;
+             case '>':
+                 t.emitTagPending();
+                 t.transition(Data);
+                 break;
+             case nullChar: // replacement; stay in this state and keep reading the name
+                 t.tagPending.appendTagName(replacementStr);
+                 break;
+             case eof: // should emit pending tag?
+                 t.eofError(this);
+                 t.transition(Data);
+                 break;
+             // no default, as covered with above consumeToAny
+         }
+     }
+ },
+ RcdataLessthanSign {
+     // reached from '<' in the Rcdata state
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matches('/')) {
+             t.createTempBuffer();
+             t.advanceTransition(RCDATAEndTagOpen);
+             return;
+         }
+
+         if (!r.matchesLetter() || r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
+             // ordinary case: the '<' is just text inside the RCDATA element
+             t.emit("<");
+             t.transition(Rcdata);
+             return;
+         }
+
+         // Diverges from the spec: a start tag follows, but the remaining input holds no
+         // appropriate end tag (e.g. </title>). Rather than consuming to EOF, close the
+         // element here and re-read the "<" as normal data.
+         t.tagPending = new Token.EndTag(t.appropriateEndTagName());
+         t.emitTagPending();
+         r.unconsume(); // undo "<"
+         t.transition(Data);
+     }
+ },
+ RCDATAEndTagOpen {
+     // reached after "</" in RCDATA content
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             t.createTagPending(false);
+             // the tag name is case-normalised ...
+             t.tagPending.appendTagName(Character.toLowerCase(r.current()));
+             // ... but the buffer keeps the character exactly as written: RCDATAEndTagName
+             // emits "</" + dataBuffer back as text when this turns out not to be the
+             // closing tag, so lower-casing here would alter the document's text.
+             t.dataBuffer.append(r.current());
+             t.advanceTransition(RCDATAEndTagName);
+         } else {
+             // not an end tag: "</" is plain text
+             t.emit("</");
+             t.transition(Rcdata);
+         }
+     }
+ },
+ RCDATAEndTagName {
+     // Reads the name of a candidate end tag inside RCDATA. The name is collected twice:
+     // lower-cased into the pending tag, and as written into dataBuffer so it can be
+     // emitted back as text if this is not the appropriate end tag.
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             String name = r.consumeLetterSequence();
+             // fixed locale: the default-locale toLowerCase() mis-maps 'I' under e.g. Turkish
+             t.tagPending.appendTagName(name.toLowerCase(java.util.Locale.ENGLISH));
+             t.dataBuffer.append(name);
+             return;
+         }
+
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\f':
+             case ' ':
+                 if (t.isAppropriateEndTagToken())
+                     t.transition(BeforeAttributeName);
+                 else
+                     anythingElse(t, r);
+                 break;
+             case '/':
+                 if (t.isAppropriateEndTagToken())
+                     t.transition(SelfClosingStartTag);
+                 else
+                     anythingElse(t, r);
+                 break;
+             case '>':
+                 if (t.isAppropriateEndTagToken()) {
+                     t.emitTagPending();
+                     t.transition(Data);
+                 }
+                 else
+                     anythingElse(t, r);
+                 break;
+             default:
+                 anythingElse(t, r);
+         }
+     }
+
+     // Not the closing tag: emit what was read as text and resume RCDATA. Every caller has
+     // just consumed one character, so it is handed back to the reader here; otherwise the
+     // character that ended the name would silently vanish from the output.
+     private void anythingElse(Tokeniser t, CharacterReader r) {
+         t.emit("</" + t.dataBuffer.toString());
+         r.unconsume();
+         t.transition(Rcdata);
+     }
+ },
+ RawtextLessthanSign {
+     // reached from '<' in the Rawtext state
+     void read(Tokeniser t, CharacterReader r) {
+         if (!r.matches('/')) {
+             // just a literal '<' inside the raw text
+             t.emit('<');
+             t.transition(Rawtext);
+             return;
+         }
+         t.createTempBuffer();
+         t.advanceTransition(RawtextEndTagOpen);
+     }
+ },
+ RawtextEndTagOpen {
+     // reached after "</" in raw text
+     void read(Tokeniser t, CharacterReader r) {
+         if (!r.matchesLetter()) {
+             // no tag name follows: "</" is plain text
+             t.emit("</");
+             t.transition(Rawtext);
+             return;
+         }
+         t.createTagPending(false);
+         t.transition(RawtextEndTagName);
+     }
+ },
+ RawtextEndTagName {
+     // Reads the name of a candidate end tag inside raw text. The name goes lower-cased
+     // into the pending tag and, as written, into dataBuffer for re-emission as text.
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             String name = r.consumeLetterSequence();
+             // fixed locale: the default-locale toLowerCase() mis-maps 'I' under e.g. Turkish
+             t.tagPending.appendTagName(name.toLowerCase(java.util.Locale.ENGLISH));
+             t.dataBuffer.append(name);
+             return;
+         }
+
+         if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
+             char c = r.consume();
+             switch (c) {
+                 case '\t':
+                 case '\n':
+                 case '\f':
+                 case ' ':
+                     t.transition(BeforeAttributeName);
+                     break;
+                 case '/':
+                     t.transition(SelfClosingStartTag);
+                     break;
+                 case '>':
+                     t.emitTagPending();
+                     t.transition(Data);
+                     break;
+                 default:
+                     // keep the consumed character so it is emitted with the rest of the text
+                     t.dataBuffer.append(c);
+                     anythingElse(t, r);
+             }
+         } else
+             anythingElse(t, r);
+     }
+
+     // not the closing tag: emit what was buffered as text and resume raw text
+     private void anythingElse(Tokeniser t, CharacterReader r) {
+         t.emit("</" + t.dataBuffer.toString());
+         t.transition(Rawtext);
+     }
+ },
+ ScriptDataLessthanSign {
+ void read(Tokeniser t, CharacterReader r) {
+ switch (r.consume()) {
+ case '/':
+ t.createTempBuffer();
+ t.transition(ScriptDataEndTagOpen);
+ break;
+ case '!':
+ t.emit("<!");
+ t.transition(ScriptDataEscapeStart);
+ break;
+ default:
+ t.emit("<");
+ r.unconsume();
+ t.transition(ScriptData);
+ }
+ }
+ },
+ ScriptDataEndTagOpen {
+     // reached after "</" in script data
+     void read(Tokeniser t, CharacterReader r) {
+         if (!r.matchesLetter()) {
+             // no tag name follows: "</" is plain script text
+             t.emit("</");
+             t.transition(ScriptData);
+             return;
+         }
+         t.createTagPending(false);
+         t.transition(ScriptDataEndTagName);
+     }
+ },
+ ScriptDataEndTagName {
+     // Reads the name of a candidate end tag inside script data. The name goes lower-cased
+     // into the pending tag and, as written, into dataBuffer for re-emission as text.
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             String name = r.consumeLetterSequence();
+             // fixed locale: the default-locale toLowerCase() mis-maps 'I' under e.g. Turkish
+             t.tagPending.appendTagName(name.toLowerCase(java.util.Locale.ENGLISH));
+             t.dataBuffer.append(name);
+             return;
+         }
+
+         if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
+             char c = r.consume();
+             switch (c) {
+                 case '\t':
+                 case '\n':
+                 case '\f':
+                 case ' ':
+                     t.transition(BeforeAttributeName);
+                     break;
+                 case '/':
+                     t.transition(SelfClosingStartTag);
+                     break;
+                 case '>':
+                     t.emitTagPending();
+                     t.transition(Data);
+                     break;
+                 default:
+                     // keep the consumed character so it is emitted with the rest of the text
+                     t.dataBuffer.append(c);
+                     anythingElse(t, r);
+             }
+         } else {
+             anythingElse(t, r);
+         }
+     }
+
+     // not the closing tag: emit what was buffered as text and resume script data
+     private void anythingElse(Tokeniser t, CharacterReader r) {
+         t.emit("</" + t.dataBuffer.toString());
+         t.transition(ScriptData);
+     }
+ },
+ ScriptDataEscapeStart {
+     // reached after "<!" in script data; a '-' continues towards the escaped states
+     void read(Tokeniser t, CharacterReader r) {
+         if (!r.matches('-')) {
+             t.transition(ScriptData);
+             return;
+         }
+         t.emit('-');
+         t.advanceTransition(ScriptDataEscapeStartDash);
+     }
+ },
+ ScriptDataEscapeStartDash {
+     // reached after "<!-" in script data; a second '-' enters the escaped dash-dash state
+     void read(Tokeniser t, CharacterReader r) {
+         if (!r.matches('-')) {
+             t.transition(ScriptData);
+             return;
+         }
+         t.emit('-');
+         t.advanceTransition(ScriptDataEscapedDashDash);
+     }
+ },
+ ScriptDataEscaped {
+ void read(Tokeniser t, CharacterReader r) {
+ if (r.isEmpty()) {
+ t.eofError(this);
+ t.transition(Data);
+ return;
+ }
+
+ switch (r.current()) {
+ case '-':
+ t.emit('-');
+ t.advanceTransition(ScriptDataEscapedDash);
+ break;
+ case '<':
+ t.advanceTransition(ScriptDataEscapedLessthanSign);
+ break;
+ case nullChar:
+ t.error(this);
+ r.advance();
+ t.emit(replacementChar);
+ break;
+ default:
+ String data = r.consumeToAny('-', '<', nullChar);
+ t.emit(data);
+ }
+ }
+ },
+ ScriptDataEscapedDash {
+ void read(Tokeniser t, CharacterReader r) {
+ if (r.isEmpty()) {
+ t.eofError(this);
+ t.transition(Data);
+ return;
+ }
+
+ char c = r.consume();
+ switch (c) {
+ case '-':
+ t.emit(c);
+ t.transition(ScriptDataEscapedDashDash);
+ break;
+ case '<':
+ t.transition(ScriptDataEscapedLessthanSign);
+ break;
+ case nullChar:
+ t.error(this);
+ t.emit(replacementChar);
+ t.transition(ScriptDataEscaped);
+ break;
+ default:
+ t.emit(c);
+ t.transition(ScriptDataEscaped);
+ }
+ }
+ },
+ ScriptDataEscapedDashDash {
+ void read(Tokeniser t, CharacterReader r) {
+ if (r.isEmpty()) {
+ t.eofError(this);
+ t.transition(Data);
+ return;
+ }
+
+ char c = r.consume();
+ switch (c) {
+ case '-':
+ t.emit(c);
+ break;
+ case '<':
+ t.transition(ScriptDataEscapedLessthanSign);
+ break;
+ case '>':
+ t.emit(c);
+ t.transition(ScriptData);
+ break;
+ case nullChar:
+ t.error(this);
+ t.emit(replacementChar);
+ t.transition(ScriptDataEscaped);
+ break;
+ default:
+ t.emit(c);
+ t.transition(ScriptDataEscaped);
+ }
+ }
+ },
+ ScriptDataEscapedLessthanSign {
+ void read(Tokeniser t, CharacterReader r) {
+ if (r.matchesLetter()) {
+ t.createTempBuffer();
+ t.dataBuffer.append(Character.toLowerCase(r.current()));
+ t.emit("<" + r.current());
+ t.advanceTransition(ScriptDataDoubleEscapeStart);
+ } else if (r.matches('/')) {
+ t.createTempBuffer();
+ t.advanceTransition(ScriptDataEscapedEndTagOpen);
+ } else {
+ t.emit('<');
+ t.transition(ScriptDataEscaped);
+ }
+ }
+ },
+ ScriptDataEscapedEndTagOpen {
+ void read(Tokeniser t, CharacterReader r) {
+ if (r.matchesLetter()) {
+ t.createTagPending(false);
+ t.tagPending.appendTagName(Character.toLowerCase(r.current()));
+ t.dataBuffer.append(r.current());
+ t.advanceTransition(ScriptDataEscapedEndTagName);
+ } else {
+ t.emit("</");
+ t.transition(ScriptDataEscaped);
+ }
+ }
+ },
+ ScriptDataEscapedEndTagName {
+     // Reads the name of a candidate end tag inside escaped script data. The name goes
+     // lower-cased into the pending tag and, as written, into dataBuffer for re-emission.
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             String name = r.consumeLetterSequence();
+             // fixed locale: the default-locale toLowerCase() mis-maps 'I' under e.g. Turkish
+             t.tagPending.appendTagName(name.toLowerCase(java.util.Locale.ENGLISH));
+             t.dataBuffer.append(name);
+             return;
+         }
+
+         if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
+             char c = r.consume();
+             switch (c) {
+                 case '\t':
+                 case '\n':
+                 case '\f':
+                 case ' ':
+                     t.transition(BeforeAttributeName);
+                     break;
+                 case '/':
+                     t.transition(SelfClosingStartTag);
+                     break;
+                 case '>':
+                     t.emitTagPending();
+                     t.transition(Data);
+                     break;
+                 default:
+                     // keep the consumed character so it is emitted with the rest of the text
+                     t.dataBuffer.append(c);
+                     anythingElse(t, r);
+                     break;
+             }
+         } else {
+             anythingElse(t, r);
+         }
+     }
+
+     // not the closing tag: emit what was buffered as text and resume escaped script data
+     private void anythingElse(Tokeniser t, CharacterReader r) {
+         t.emit("</" + t.dataBuffer.toString());
+         t.transition(ScriptDataEscaped);
+     }
+ },
+ ScriptDataDoubleEscapeStart {
+     // Inside escaped script data, after "<" + letter: collects the name to see whether a
+     // nested "<script" is starting. Everything read is emitted as text either way.
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             String name = r.consumeLetterSequence();
+             // Fixed locale: the buffer is compared against "script" below, and the
+             // default-locale toLowerCase() would turn "SCRIPT" into "scr\u0131pt" under Turkish.
+             t.dataBuffer.append(name.toLowerCase(java.util.Locale.ENGLISH));
+             t.emit(name);
+             return;
+         }
+
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\f':
+             case ' ':
+             case '/':
+             case '>':
+                 // name finished: only "script" switches to the double-escaped state
+                 if (t.dataBuffer.toString().equals("script"))
+                     t.transition(ScriptDataDoubleEscaped);
+                 else
+                     t.transition(ScriptDataEscaped);
+                 t.emit(c);
+                 break;
+             default:
+                 // not a tag after all: give the character back and resume escaped data
+                 r.unconsume();
+                 t.transition(ScriptDataEscaped);
+         }
+     }
+ },
+ ScriptDataDoubleEscaped {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.current();
+ switch (c) {
+ case '-':
+ t.emit(c);
+ t.advanceTransition(ScriptDataDoubleEscapedDash);
+ break;
+ case '<':
+ t.emit(c);
+ t.advanceTransition(ScriptDataDoubleEscapedLessthanSign);
+ break;
+ case nullChar:
+ t.error(this);
+ r.advance();
+ t.emit(replacementChar);
+ break;
+ case eof:
+ t.eofError(this);
+ t.transition(Data);
+ break;
+ default:
+ String data = r.consumeToAny('-', '<', nullChar);
+ t.emit(data);
+ }
+ }
+ },
+ ScriptDataDoubleEscapedDash {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '-':
+ t.emit(c);
+ t.transition(ScriptDataDoubleEscapedDashDash);
+ break;
+ case '<':
+ t.emit(c);
+ t.transition(ScriptDataDoubleEscapedLessthanSign);
+ break;
+ case nullChar:
+ t.error(this);
+ t.emit(replacementChar);
+ t.transition(ScriptDataDoubleEscaped);
+ break;
+ case eof:
+ t.eofError(this);
+ t.transition(Data);
+ break;
+ default:
+ t.emit(c);
+ t.transition(ScriptDataDoubleEscaped);
+ }
+ }
+ },
+ ScriptDataDoubleEscapedDashDash {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '-':
+ t.emit(c);
+ break;
+ case '<':
+ t.emit(c);
+ t.transition(ScriptDataDoubleEscapedLessthanSign);
+ break;
+ case '>':
+ t.emit(c);
+ t.transition(ScriptData);
+ break;
+ case nullChar:
+ t.error(this);
+ t.emit(replacementChar);
+ t.transition(ScriptDataDoubleEscaped);
+ break;
+ case eof:
+ t.eofError(this);
+ t.transition(Data);
+ break;
+ default:
+ t.emit(c);
+ t.transition(ScriptDataDoubleEscaped);
+ }
+ }
+ },
+ ScriptDataDoubleEscapedLessthanSign {
+     // reached from '<' inside double-escaped script data
+     void read(Tokeniser t, CharacterReader r) {
+         if (!r.matches('/')) {
+             t.transition(ScriptDataDoubleEscaped);
+             return;
+         }
+         // "</": emit the slash and start collecting the name that follows
+         t.emit('/');
+         t.createTempBuffer();
+         t.advanceTransition(ScriptDataDoubleEscapeEnd);
+     }
+ },
+ ScriptDataDoubleEscapeEnd {
+     // Inside double-escaped script data, after "</": collects the name to see whether the
+     // nested "</script" is closing. Everything read is emitted as text either way.
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             String name = r.consumeLetterSequence();
+             // Fixed locale: the buffer is compared against "script" below, and the
+             // default-locale toLowerCase() would turn "SCRIPT" into "scr\u0131pt" under Turkish.
+             t.dataBuffer.append(name.toLowerCase(java.util.Locale.ENGLISH));
+             t.emit(name);
+             return;
+         }
+
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\f':
+             case ' ':
+             case '/':
+             case '>':
+                 // name finished: only "script" drops back to the (single) escaped state
+                 if (t.dataBuffer.toString().equals("script"))
+                     t.transition(ScriptDataEscaped);
+                 else
+                     t.transition(ScriptDataDoubleEscaped);
+                 t.emit(c);
+                 break;
+             default:
+                 // not a tag after all: give the character back and stay double-escaped
+                 r.unconsume();
+                 t.transition(ScriptDataDoubleEscaped);
+         }
+     }
+ },
+ BeforeAttributeName {
+ // from tagname <xxx
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ break; // ignore whitespace
+ case '/':
+ t.transition(SelfClosingStartTag);
+ break;
+ case '>':
+ t.emitTagPending();
+ t.transition(Data);
+ break;
+ case nullChar:
+ t.error(this);
+ t.tagPending.newAttribute();
+ r.unconsume();
+ t.transition(AttributeName);
+ break;
+ case eof:
+ t.eofError(this);
+ t.transition(Data);
+ break;
+ case '"':
+ case '\'':
+ case '<':
+ case '=':
+ t.error(this);
+ t.tagPending.newAttribute();
+ t.tagPending.appendAttributeName(c);
+ t.transition(AttributeName);
+ break;
+ default: // A-Z, anything else
+ t.tagPending.newAttribute();
+ r.unconsume();
+ t.transition(AttributeName);
+ }
+ }
+ },
+ AttributeName {
+     // from before attribute name
+     void read(Tokeniser t, CharacterReader r) {
+         String name = r.consumeToAny('\t', '\n', '\f', ' ', '/', '=', '>', nullChar, '"', '\'', '<');
+         // Lower-case with a fixed locale: the no-arg toLowerCase() follows the JVM default
+         // locale, so under e.g. a Turkish locale "ID" would become "\u0131d".
+         t.tagPending.appendAttributeName(name.toLowerCase(java.util.Locale.ENGLISH));
+
+         // the character that stopped the name decides where to go next
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\f':
+             case ' ':
+                 t.transition(AfterAttributeName);
+                 break;
+             case '/':
+                 t.transition(SelfClosingStartTag);
+                 break;
+             case '=':
+                 t.transition(BeforeAttributeValue);
+                 break;
+             case '>':
+                 t.emitTagPending();
+                 t.transition(Data);
+                 break;
+             case nullChar:
+                 // error; the name continues with the replacement character
+                 t.error(this);
+                 t.tagPending.appendAttributeName(replacementChar);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.transition(Data);
+                 break;
+             case '"':
+             case '\'':
+             case '<':
+                 // error; the character is kept as part of the name
+                 t.error(this);
+                 t.tagPending.appendAttributeName(c);
+                 break;
+             // no default, as covered in consumeToAny
+         }
+     }
+ },
+ AfterAttributeName {
+     // reached after whitespace following an attribute name
+     void read(Tokeniser t, CharacterReader r) {
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\f':
+             case ' ':
+                 // ignore
+                 break;
+             case '/':
+                 t.transition(SelfClosingStartTag);
+                 break;
+             case '=':
+                 t.transition(BeforeAttributeValue);
+                 break;
+             case '>':
+                 t.emitTagPending();
+                 t.transition(Data);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 // Whitespace ended the previous name, so this character begins a NEW
+                 // attribute (as in the cases below and in the HTML tokenization rules);
+                 // without newAttribute() the replacement character would be glued onto
+                 // the previous attribute's name.
+                 t.tagPending.newAttribute();
+                 t.tagPending.appendAttributeName(replacementChar);
+                 t.transition(AttributeName);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.transition(Data);
+                 break;
+             case '"':
+             case '\'':
+             case '<':
+                 // unexpected punctuation: reported, but still starts a new attribute name
+                 t.error(this);
+                 t.tagPending.newAttribute();
+                 t.tagPending.appendAttributeName(c);
+                 t.transition(AttributeName);
+                 break;
+             default: // A-Z, anything else
+                 t.tagPending.newAttribute();
+                 r.unconsume();
+                 t.transition(AttributeName);
+         }
+     }
+ },
+ BeforeAttributeValue {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ // ignore
+ break;
+ case '"':
+ t.transition(AttributeValue_doubleQuoted);
+ break;
+ case '&':
+ r.unconsume();
+ t.transition(AttributeValue_unquoted);
+ break;
+ case '\'':
+ t.transition(AttributeValue_singleQuoted);
+ break;
+ case nullChar:
+ t.error(this);
+ t.tagPending.appendAttributeValue(replacementChar);
+ t.transition(AttributeValue_unquoted);
+ break;
+ case eof:
+ t.eofError(this);
+ t.transition(Data);
+ break;
+ case '>':
+ t.error(this);
+ t.emitTagPending();
+ t.transition(Data);
+ break;
+ case '<':
+ case '=':
+ case '`':
+ t.error(this);
+ t.tagPending.appendAttributeValue(c);
+ t.transition(AttributeValue_unquoted);
+ break;
+ default:
+ r.unconsume();
+ t.transition(AttributeValue_unquoted);
+ }
+ }
+ },
+ AttributeValue_doubleQuoted {
+ void read(Tokeniser t, CharacterReader r) {
+ String value = r.consumeToAny('"', '&', nullChar);
+ if (value.length() > 0)
+ t.tagPending.appendAttributeValue(value);
+
+ char c = r.consume();
+ switch (c) {
+ case '"':
+ t.transition(AfterAttributeValue_quoted);
+ break;
+ case '&':
+ Character ref = t.consumeCharacterReference('"', true);
+ if (ref != null)
+ t.tagPending.appendAttributeValue(ref);
+ else
+ t.tagPending.appendAttributeValue('&');
+ break;
+ case nullChar:
+ t.error(this);
+ t.tagPending.appendAttributeValue(replacementChar);
+ break;
+ case eof:
+ t.eofError(this);
+ t.transition(Data);
+ break;
+ // no default, handled in consume to any above
+ }
+ }
+ },
+ AttributeValue_singleQuoted {
+ void read(Tokeniser t, CharacterReader r) {
+ String value = r.consumeToAny('\'', '&', nullChar);
+ if (value.length() > 0)
+ t.tagPending.appendAttributeValue(value);
+
+ char c = r.consume();
+ switch (c) {
+ case '\'':
+ t.transition(AfterAttributeValue_quoted);
+ break;
+ case '&':
+ Character ref = t.consumeCharacterReference('\'', true);
+ if (ref != null)
+ t.tagPending.appendAttributeValue(ref);
+ else
+ t.tagPending.appendAttributeValue('&');
+ break;
+ case nullChar:
+ t.error(this);
+ t.tagPending.appendAttributeValue(replacementChar);
+ break;
+ case eof:
+ t.eofError(this);
+ t.transition(Data);
+ break;
+ // no default, handled in consume to any above
+ }
+ }
+ },
+ AttributeValue_unquoted {
+ void read(Tokeniser t, CharacterReader r) {
+ String value = r.consumeToAny('\t', '\n', '\f', ' ', '&', '>', nullChar, '"', '\'', '<', '=', '`');
+ if (value.length() > 0)
+ t.tagPending.appendAttributeValue(value);
+
+ char c = r.consume();
+ switch (c) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ t.transition(BeforeAttributeName);
+ break;
+ case '&':
+ Character ref = t.consumeCharacterReference('>', true);
+ if (ref != null)
+ t.tagPending.appendAttributeValue(ref);
+ else
+ t.tagPending.appendAttributeValue('&');
+ break;
+ case '>':
+ t.emitTagPending();
+ t.transition(Data);
+ break;
+ case nullChar:
+ t.error(this);
+ t.tagPending.appendAttributeValue(replacementChar);
+ break;
+ case eof:
+ t.eofError(this);
+ t.transition(Data);
+ break;
+ case '"':
+ case '\'':
+ case '<':
+ case '=':
+ case '`':
+ t.error(this);
+ t.tagPending.appendAttributeValue(c);
+ break;
+ // no default, handled in consume to any above
+ }
+
+ }
+ },
+ // CharacterReferenceInAttributeValue state handled inline
+ AfterAttributeValue_quoted {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ t.transition(BeforeAttributeName);
+ break;
+ case '/':
+ t.transition(SelfClosingStartTag);
+ break;
+ case '>':
+ t.emitTagPending();
+ t.transition(Data);
+ break;
+ case eof:
+ t.eofError(this);
+ t.transition(Data);
+ break;
+ default:
+ t.error(this);
+ r.unconsume();
+ t.transition(BeforeAttributeName);
+ }
+
+ }
+ },
+ SelfClosingStartTag {
+     // reached after '/' inside a tag
+     void read(Tokeniser t, CharacterReader r) {
+         char c = r.consume();
+         switch (c) {
+             case '>':
+                 t.tagPending.selfClosing = true;
+                 t.emitTagPending();
+                 t.transition(Data);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.transition(Data);
+                 break;
+             default:
+                 // A stray '/': report it, then hand the character back so that
+                 // BeforeAttributeName processes it. Without the unconsume the character
+                 // is lost, e.g. <a /href="x"> would yield an attribute named "ref".
+                 t.error(this);
+                 r.unconsume();
+                 t.transition(BeforeAttributeName);
+         }
+     }
+ },
+ BogusComment {
+     // Turns malformed markup into a comment token: everything from the character that
+     // led here up to (not including) the next '>' becomes the comment's data.
+     void read(Tokeniser t, CharacterReader r) {
+         // todo: handle bogus comment starting from eof. when does that trigger?
+         // rewind to capture character that lead us here
+         r.unconsume();
+         Token.Comment comment = new Token.Comment();
+         String raw = r.consumeTo('>');
+         // NUL characters are not allowed in comment data: swap each one for the
+         // replacement character, as the other comment states in this enum do
+         for (int i = 0; i < raw.length(); i++) {
+             char ch = raw.charAt(i);
+             if (ch == nullChar)
+                 comment.data.append(replacementChar);
+             else
+                 comment.data.append(ch);
+         }
+         t.emit(comment);
+         // NOTE(review): presumably the advance steps over the terminating '>' - confirm in Tokeniser
+         t.advanceTransition(Data);
+     }
+ },
+ MarkupDeclarationOpen {
+     // reached after "<!": comment, doctype, CDATA section, or otherwise a bogus comment
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchConsume("--")) {
+             t.createCommentPending();
+             t.transition(CommentStart);
+             return;
+         }
+         if (r.matchConsumeIgnoreCase("DOCTYPE")) {
+             t.transition(Doctype);
+             return;
+         }
+         // todo: should actually check the current namespace, as only non-html content allows
+         // cdata (i.e. !t.currentNodeInHtmlNS() && ...). until namespaces are implemented
+         // properly, keep handling as cdata
+         if (r.matchConsume("[CDATA[")) {
+             t.transition(CdataSection);
+             return;
+         }
+
+         t.error(this);
+         t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
+     }
+ },
+ CommentStart {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '-':
+ t.transition(CommentStartDash);
+ break;
+ case nullChar:
+ t.error(this);
+ t.commentPending.data.append(replacementChar);
+ t.transition(Comment);
+ break;
+ case '>':
+ t.error(this);
+ t.emitCommentPending();
+ t.transition(Data);
+ break;
+ case eof:
+ t.eofError(this);
+ t.emitCommentPending();
+ t.transition(Data);
+ break;
+ default:
+ t.commentPending.data.append(c);
+ t.transition(Comment);
+ }
+ }
+ },
+ CommentStartDash {
+     // reached after "<!--" followed by one '-'; that dash has been consumed but is not
+     // yet part of the comment data
+     void read(Tokeniser t, CharacterReader r) {
+         char c = r.consume();
+         switch (c) {
+             case '-':
+                 // a second dash may be the start of the closing "-->": CommentEnd decides.
+                 // (Looping back to this state would lose the dashes of e.g. "<!----x-->".)
+                 t.transition(CommentEnd);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 // the pending '-' was data after all, followed by the replaced NUL
+                 t.commentPending.data.append('-').append(replacementChar);
+                 t.transition(Comment);
+                 break;
+             case '>':
+                 // "<!--->": reported, comment emitted as-is
+                 t.error(this);
+                 t.emitCommentPending();
+                 t.transition(Data);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.emitCommentPending();
+                 t.transition(Data);
+                 break;
+             default:
+                 // the pending '-' was data after all: keep it together with this character
+                 t.commentPending.data.append('-').append(c);
+                 t.transition(Comment);
+         }
+     }
+ },
+ Comment {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.current();
+ switch (c) {
+ case '-':
+ t.advanceTransition(CommentEndDash);
+ break;
+ case nullChar:
+ t.error(this);
+ r.advance();
+ t.commentPending.data.append(replacementChar);
+ break;
+ case eof:
+ t.eofError(this);
+ t.emitCommentPending();
+ t.transition(Data);
+ break;
+ default:
+ t.commentPending.data.append(r.consumeToAny('-', nullChar));
+ }
+ }
+ },
+ CommentEndDash {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '-':
+ t.transition(CommentEnd);
+ break;
+ case nullChar:
+ t.error(this);
+ t.commentPending.data.append('-').append(replacementChar);
+ t.transition(Comment);
+ break;
+ case eof:
+ t.eofError(this);
+ t.emitCommentPending();
+ t.transition(Data);
+ break;
+ default:
+ t.commentPending.data.append('-').append(c);
+ t.transition(Comment);
+ }
+ }
+ },
+ CommentEnd {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '>':
+ t.emitCommentPending();
+ t.transition(Data);
+ break;
+ case nullChar:
+ t.error(this);
+ t.commentPending.data.append("--").append(replacementChar);
+ t.transition(Comment);
+ break;
+ case '!':
+ t.error(this);
+ t.transition(CommentEndBang);
+ break;
+ case '-':
+ t.error(this);
+ t.commentPending.data.append('-');
+ break;
+ case eof:
+ t.eofError(this);
+ t.emitCommentPending();
+ t.transition(Data);
+ break;
+ default:
+ t.error(this);
+ t.commentPending.data.append("--").append(c);
+ t.transition(Comment);
+ }
+ }
+ },
+ CommentEndBang {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '-':
+ t.commentPending.data.append("--!");
+ t.transition(CommentEndDash);
+ break;
+ case '>':
+ t.emitCommentPending();
+ t.transition(Data);
+ break;
+ case nullChar:
+ t.error(this);
+ t.commentPending.data.append("--!").append(replacementChar);
+ t.transition(Comment);
+ break;
+ case eof:
+ t.eofError(this);
+ t.emitCommentPending();
+ t.transition(Data);
+ break;
+ default:
+ t.commentPending.data.append("--!").append(c);
+ t.transition(Comment);
+ }
+ }
+ },
+ Doctype {
+     // reached right after "<!DOCTYPE"
+     void read(Tokeniser t, CharacterReader r) {
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\f':
+             case ' ':
+                 t.transition(BeforeDoctypeName);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.createDoctypePending();
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             case '>':
+                 // "<!DOCTYPE>" with no name at all: emit an empty, quirks-forcing doctype
+                 // instead of swallowing the '>' and reading the following content as a name
+                 t.error(this);
+                 t.createDoctypePending();
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 // Missing whitespace (e.g. "<!DOCTYPEhtml>"): report it, then hand the
+                 // character back so BeforeDoctypeName sees the first character of the name.
+                 // Without the unconsume that character is dropped ("html" -> "tml").
+                 t.error(this);
+                 r.unconsume();
+                 t.transition(BeforeDoctypeName);
+         }
+     }
+ },
+ BeforeDoctypeName {
+     // skips whitespace before the doctype name and creates the pending doctype token
+     // as soon as the name (or the end of the declaration) is reached
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             // the letter is left in the reader for DoctypeName to consume
+             t.createDoctypePending();
+             t.transition(DoctypeName);
+             return;
+         }
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\f':
+             case ' ':
+                 break; // ignore whitespace
+             case nullChar:
+                 t.error(this);
+                 // the token must exist before its name is touched, exactly as in the other
+                 // branches; otherwise this writes to a missing or previous doctype
+                 t.createDoctypePending();
+                 t.doctypePending.name.append(replacementChar);
+                 t.transition(DoctypeName);
+                 break;
+             case '>':
+                 // "<!DOCTYPE >": no name given - emit an empty, quirks-forcing doctype
+                 t.error(this);
+                 t.createDoctypePending();
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.createDoctypePending();
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.createDoctypePending();
+                 t.doctypePending.name.append(c);
+                 t.transition(DoctypeName);
+         }
+     }
+ },
+ DoctypeName {
+     // reads the doctype name into the pending doctype token
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             String name = r.consumeLetterSequence();
+             // Lower-case with a fixed locale: the no-arg toLowerCase() follows the JVM
+             // default locale and mis-maps 'I' under e.g. a Turkish locale.
+             t.doctypePending.name.append(name.toLowerCase(java.util.Locale.ENGLISH));
+             return;
+         }
+         char c = r.consume();
+         switch (c) {
+             case '>':
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             case '\t':
+             case '\n':
+             case '\f':
+             case ' ':
+                 t.transition(AfterDoctypeName);
+                 break;
+             case nullChar:
+                 // error; the name continues with the replacement character
+                 t.error(this);
+                 t.doctypePending.name.append(replacementChar);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 // any other non-letter character is part of the name as written
+                 t.doctypePending.name.append(c);
+         }
+     }
+ },
+ AfterDoctypeName {
+     // reached after the doctype name: expects '>', PUBLIC or SYSTEM
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.isEmpty()) {
+             t.eofError(this);
+             t.doctypePending.forceQuirks = true;
+             t.emitDoctypePending();
+             t.transition(Data);
+             return;
+         }
+         if (r.matchesAny('\t', '\n', '\f', ' ')) {
+             r.advance(); // ignore whitespace
+             return;
+         }
+         if (r.matches('>')) {
+             t.emitDoctypePending();
+             t.advanceTransition(Data);
+             return;
+         }
+         if (r.matchConsumeIgnoreCase("PUBLIC")) {
+             t.transition(AfterDoctypePublicKeyword);
+             return;
+         }
+         if (r.matchConsumeIgnoreCase("SYSTEM")) {
+             t.transition(AfterDoctypeSystemKeyword);
+             return;
+         }
+
+         // anything else makes the rest of the declaration bogus
+         t.error(this);
+         t.doctypePending.forceQuirks = true;
+         t.advanceTransition(BogusDoctype);
+     }
+ },
+ AfterDoctypePublicKeyword {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ t.transition(BeforeDoctypePublicIdentifier);
+ break;
+ case '"':
+ t.error(this);
+ // set public id to empty string
+ t.transition(DoctypePublicIdentifier_doubleQuoted);
+ break;
+ case '\'':
+ t.error(this);
+ // set public id to empty string
+ t.transition(DoctypePublicIdentifier_singleQuoted);
+ break;
+ case '>':
+ t.error(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ case eof:
+ t.eofError(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ default:
+ t.error(this);
+ t.doctypePending.forceQuirks = true;
+ t.transition(BogusDoctype);
+ }
+ }
+ },
+ BeforeDoctypePublicIdentifier {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ break;
+ case '"':
+ // set public id to empty string
+ t.transition(DoctypePublicIdentifier_doubleQuoted);
+ break;
+ case '\'':
+ // set public id to empty string
+ t.transition(DoctypePublicIdentifier_singleQuoted);
+ break;
+ case '>':
+ t.error(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ case eof:
+ t.eofError(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ default:
+ t.error(this);
+ t.doctypePending.forceQuirks = true;
+ t.transition(BogusDoctype);
+ }
+ }
+ },
+ DoctypePublicIdentifier_doubleQuoted {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '"':
+ t.transition(AfterDoctypePublicIdentifier);
+ break;
+ case nullChar:
+ t.error(this);
+ t.doctypePending.publicIdentifier.append(replacementChar);
+ break;
+ case '>':
+ t.error(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ case eof:
+ t.eofError(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ default:
+ t.doctypePending.publicIdentifier.append(c);
+ }
+ }
+ },
+ DoctypePublicIdentifier_singleQuoted {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '\'':
+ t.transition(AfterDoctypePublicIdentifier);
+ break;
+ case nullChar:
+ t.error(this);
+ t.doctypePending.publicIdentifier.append(replacementChar);
+ break;
+ case '>':
+ t.error(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ case eof:
+ t.eofError(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ default:
+ t.doctypePending.publicIdentifier.append(c);
+ }
+ }
+ },
+ AfterDoctypePublicIdentifier {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ t.transition(BetweenDoctypePublicAndSystemIdentifiers);
+ break;
+ case '>':
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ case '"':
+ t.error(this);
+ // system id empty
+ t.transition(DoctypeSystemIdentifier_doubleQuoted);
+ break;
+ case '\'':
+ t.error(this);
+ // system id empty
+ t.transition(DoctypeSystemIdentifier_singleQuoted);
+ break;
+ case eof:
+ t.eofError(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ default:
+ t.error(this);
+ t.doctypePending.forceQuirks = true;
+ t.transition(BogusDoctype);
+ }
+ }
+ },
+ BetweenDoctypePublicAndSystemIdentifiers {
+     // Entered from AfterDoctypePublicIdentifier once whitespace has been seen after the public
+     // identifier. Skips any further whitespace, then either finishes the doctype ('>') or starts
+     // the system identifier on an opening quote.
+     void read(Tokeniser t, CharacterReader r) {
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\f':
+             case ' ':
+                 // additional whitespace between the identifiers is ignored
+                 break;
+             case '>':
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             case '"':
+                 // Whitespace already separates the two identifiers, so this is the well-formed
+                 // case and is NOT a parse error (unlike the same quote in AfterDoctypePublicIdentifier).
+                 // system id is meant to start out empty
+                 t.transition(DoctypeSystemIdentifier_doubleQuoted);
+                 break;
+             case '\'':
+                 // well-formed for the same reason as the double-quote case: no parse error
+                 // system id is meant to start out empty
+                 t.transition(DoctypeSystemIdentifier_singleQuoted);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 // unexpected character: flag force-quirks and let BogusDoctype handle the rest
+                 t.error(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.transition(BogusDoctype);
+         }
+     }
+ },
+ AfterDoctypeSystemKeyword {
+     // Handles the character following the SYSTEM keyword of a doctype. Whitespace leads to
+     // BeforeDoctypeSystemIdentifier; a quote with no whitespace before it is reported as an error
+     // but still starts the system identifier; '>' and end of input emit a force-quirks doctype.
+     void read(Tokeniser t, CharacterReader r) {
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\f':
+             case ' ':
+                 t.transition(BeforeDoctypeSystemIdentifier);
+                 break;
+             case '>':
+                 // keyword present but no identifier
+                 t.error(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             case '"':
+                 t.error(this);
+                 // system id is meant to start out empty
+                 t.transition(DoctypeSystemIdentifier_doubleQuoted);
+                 break;
+             case '\'':
+                 t.error(this);
+                 // system id is meant to start out empty
+                 t.transition(DoctypeSystemIdentifier_singleQuoted);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 // Unexpected character: flag force-quirks and let BogusDoctype skip to the closing
+                 // '>' (or end of input) and emit the doctype exactly once. Emitting here without
+                 // leaving the state would emit the doctype again for every further stray character.
+                 t.error(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.transition(BogusDoctype);
+         }
+     }
+ },
+ BeforeDoctypeSystemIdentifier {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ break;
+ case '"':
+ // set system id to empty string
+ t.transition(DoctypeSystemIdentifier_doubleQuoted);
+ break;
+ case '\'':
+ // set public id to empty string
+ t.transition(DoctypeSystemIdentifier_singleQuoted);
+ break;
+ case '>':
+ t.error(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ case eof:
+ t.eofError(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ default:
+ t.error(this);
+ t.doctypePending.forceQuirks = true;
+ t.transition(BogusDoctype);
+ }
+ }
+ },
+ DoctypeSystemIdentifier_doubleQuoted {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '"':
+ t.transition(AfterDoctypeSystemIdentifier);
+ break;
+ case nullChar:
+ t.error(this);
+ t.doctypePending.systemIdentifier.append(replacementChar);
+ break;
+ case '>':
+ t.error(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ case eof:
+ t.eofError(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ default:
+ t.doctypePending.systemIdentifier.append(c);
+ }
+ }
+ },
+ DoctypeSystemIdentifier_singleQuoted {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '\'':
+ t.transition(AfterDoctypeSystemIdentifier);
+ break;
+ case nullChar:
+ t.error(this);
+ t.doctypePending.systemIdentifier.append(replacementChar);
+ break;
+ case '>':
+ t.error(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ case eof:
+ t.eofError(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ default:
+ t.doctypePending.systemIdentifier.append(c);
+ }
+ }
+ },
+ AfterDoctypeSystemIdentifier {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ break;
+ case '>':
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ case eof:
+ t.eofError(this);
+ t.doctypePending.forceQuirks = true;
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ default:
+ t.error(this);
+ t.transition(BogusDoctype);
+ // NOT force quirks
+ }
+ }
+ },
+ BogusDoctype {
+ void read(Tokeniser t, CharacterReader r) {
+ char c = r.consume();
+ switch (c) {
+ case '>':
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ case eof:
+ t.emitDoctypePending();
+ t.transition(Data);
+ break;
+ default:
+ // ignore char
+ break;
+ }
+ }
+ },
+ CdataSection {
+ void read(Tokeniser t, CharacterReader r) {
+ String data = r.consumeTo("]]>");
+ t.emit(data);
+ r.matchConsume("]]>");
+ t.transition(Data);
+ }
+ };
+
+
+ abstract void read(Tokeniser t, CharacterReader r); // implemented by every state: takes input from r and drives t (emit / error / transition)
+
+ private static final char nullChar = '\u0000'; // NUL; states match it explicitly in their switches
+ private static final char replacementChar = Tokeniser.replacementChar; // stored or emitted in place of NUL (e.g. inside doctype identifiers)
+ private static final String replacementStr = String.valueOf(Tokeniser.replacementChar); // String form of Tokeniser.replacementChar
+ private static final char eof = CharacterReader.EOF; // end-of-input marker; states compare it with the reader's current/consumed char
+}