aboutsummaryrefslogtreecommitdiffstats
path: root/src/org/jsoup/parser/TokeniserState.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/org/jsoup/parser/TokeniserState.java')
-rw-r--r--src/org/jsoup/parser/TokeniserState.java1778
1 files changed, 0 insertions, 1778 deletions
diff --git a/src/org/jsoup/parser/TokeniserState.java b/src/org/jsoup/parser/TokeniserState.java
deleted file mode 100644
index e3013c73e9..0000000000
--- a/src/org/jsoup/parser/TokeniserState.java
+++ /dev/null
@@ -1,1778 +0,0 @@
-package org.jsoup.parser;
-
-/**
- * States and transition activations for the Tokeniser.
- */
-enum TokeniserState {
- Data {
- // in data state, gather characters until a character reference or tag is found
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case '&':
- t.advanceTransition(CharacterReferenceInData);
- break;
- case '<':
- t.advanceTransition(TagOpen);
- break;
- case nullChar:
- t.error(this); // NOT replacement character (oddly?)
- t.emit(r.consume());
- break;
- case eof:
- t.emit(new Token.EOF());
- break;
- default:
- String data = r.consumeToAny('&', '<', nullChar);
- t.emit(data);
- break;
- }
- }
- },
- CharacterReferenceInData {
- // Entered from '&' in Data: emits whatever consumeCharacterReference returns, or a literal '&' when it
- // returns null, then goes back to Data.
- void read(Tokeniser t, CharacterReader r) {
- Character decoded = t.consumeCharacterReference(null, false);
- if (decoded != null)
- t.emit(decoded);
- else
- t.emit('&');
- t.transition(Data);
- }
- },
- Rcdata {
- /// handles data in title, textarea etc
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case '&':
- t.advanceTransition(CharacterReferenceInRcdata);
- break;
- case '<':
- t.advanceTransition(RcdataLessthanSign);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- case eof:
- t.emit(new Token.EOF());
- break;
- default:
- String data = r.consumeToAny('&', '<', nullChar);
- t.emit(data);
- break;
- }
- }
- },
- CharacterReferenceInRcdata {
- // Entered from '&' in Rcdata: emits whatever consumeCharacterReference returns, or a literal '&' when it
- // returns null, then goes back to Rcdata.
- void read(Tokeniser t, CharacterReader r) {
- Character decoded = t.consumeCharacterReference(null, false);
- if (decoded != null)
- t.emit(decoded);
- else
- t.emit('&');
- t.transition(Rcdata);
- }
- },
- Rawtext {
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case '<':
- t.advanceTransition(RawtextLessthanSign);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- case eof:
- t.emit(new Token.EOF());
- break;
- default:
- String data = r.consumeToAny('<', nullChar);
- t.emit(data);
- break;
- }
- }
- },
- ScriptData {
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case '<':
- t.advanceTransition(ScriptDataLessthanSign);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- case eof:
- t.emit(new Token.EOF());
- break;
- default:
- String data = r.consumeToAny('<', nullChar);
- t.emit(data);
- break;
- }
- }
- },
- PLAINTEXT {
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- case eof:
- t.emit(new Token.EOF());
- break;
- default:
- String data = r.consumeTo(nullChar);
- t.emit(data);
- break;
- }
- }
- },
- TagOpen {
- // from < in data
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case '!':
- t.advanceTransition(MarkupDeclarationOpen);
- break;
- case '/':
- t.advanceTransition(EndTagOpen);
- break;
- case '?':
- t.advanceTransition(BogusComment);
- break;
- default:
- if (r.matchesLetter()) {
- t.createTagPending(true);
- t.transition(TagName);
- } else {
- t.error(this);
- t.emit('<'); // char that got us here
- t.transition(Data);
- }
- break;
- }
- }
- },
- EndTagOpen {
- void read(Tokeniser t, CharacterReader r) {
- if (r.isEmpty()) {
- t.eofError(this);
- t.emit("</");
- t.transition(Data);
- } else if (r.matchesLetter()) {
- t.createTagPending(false);
- t.transition(TagName);
- } else if (r.matches('>')) {
- t.error(this);
- t.advanceTransition(Data);
- } else {
- t.error(this);
- t.advanceTransition(BogusComment);
- }
- }
- },
- TagName {
- // from < or </ in data, will have start or end tag pending
- void read(Tokeniser t, CharacterReader r) {
- // previous TagOpen state did NOT consume, will have a letter char in current
- // Lower-case with a fixed locale: the default-locale toLowerCase() maps 'I' to a dotless i under
- // Turkish/Azeri locales, which would corrupt names such as TITLE.
- String tagName = r.consumeToAny('\t', '\n', '\f', ' ', '/', '>', nullChar).toLowerCase(java.util.Locale.ENGLISH);
- t.tagPending.appendTagName(tagName);
-
- // the character that stopped the run above decides what follows
- switch (r.consume()) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeAttributeName);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- case nullChar: // replacement; stays in this state to keep reading the name
- t.tagPending.appendTagName(replacementStr);
- break;
- case eof: // should emit pending tag?
- t.eofError(this);
- t.transition(Data);
- // no default, as covered with above consumeToAny
- }
- }
- },
- RcdataLessthanSign {
- // from < in rcdata
- void read(Tokeniser t, CharacterReader r) {
- if (r.matches('/')) {
- t.createTempBuffer();
- t.advanceTransition(RCDATAEndTagOpen);
- return;
- }
- if (r.matchesLetter() && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
- // Deliberate departure from the spec: a start tag was found and the remaining input holds no
- // appropriate end tag (e.g. </title>), so instead of swallowing everything up to EOF an end tag
- // is emitted here and the '<' is handed back to the Data state.
- t.tagPending = new Token.EndTag(t.appropriateEndTagName());
- t.emitTagPending();
- r.unconsume(); // undo "<"
- t.transition(Data);
- return;
- }
- t.emit("<");
- t.transition(Rcdata);
- }
- },
- RCDATAEndTagOpen {
- // Reached after "</" inside RCDATA. A letter starts a candidate end tag; anything else means the "</" was
- // ordinary text.
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- t.createTagPending(false);
- t.tagPending.appendTagName(Character.toLowerCase(r.current()));
- // RCDATAEndTagName re-emits the temp buffer as text when the candidate is not the appropriate end
- // tag, so the buffer must keep the input's original case; only the tag name is lower-cased.
- t.dataBuffer.append(r.current());
- t.advanceTransition(RCDATAEndTagName);
- } else {
- t.emit("</");
- t.transition(Rcdata);
- }
- }
- },
- RCDATAEndTagName {
- // Collects the name of a candidate end tag inside RCDATA. The tag is only accepted when
- // t.isAppropriateEndTagToken() holds; otherwise what was read so far is emitted as text.
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- // fixed locale so that e.g. 'I' always lower-cases to 'i', whatever the JVM default locale is
- t.tagPending.appendTagName(name.toLowerCase(java.util.Locale.ENGLISH));
- t.dataBuffer.append(name); // original case, in case it has to be re-emitted as text
- return;
- }
-
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- if (t.isAppropriateEndTagToken())
- t.transition(BeforeAttributeName);
- else
- anythingElse(t, r);
- break;
- case '/':
- if (t.isAppropriateEndTagToken())
- t.transition(SelfClosingStartTag);
- else
- anythingElse(t, r);
- break;
- case '>':
- if (t.isAppropriateEndTagToken()) {
- t.emitTagPending();
- t.transition(Data);
- }
- else
- anythingElse(t, r);
- break;
- default:
- anythingElse(t, r);
- }
- }
-
- // Not an end tag after all: emit "</" plus the buffered name as text and resume RCDATA.
- private void anythingElse(Tokeniser t, CharacterReader r) {
- t.emit("</" + t.dataBuffer.toString());
- // read() has already consumed the character that ended the name; put it back so that Rcdata
- // processes it instead of it silently disappearing from the output
- r.unconsume();
- t.transition(Rcdata);
- }
- },
- RawtextLessthanSign {
- // Entered from '<' in Rawtext: only "</" can lead anywhere; any other '<' is plain text.
- void read(Tokeniser t, CharacterReader r) {
- if (!r.matches('/')) {
- t.emit('<');
- t.transition(Rawtext);
- return;
- }
- t.createTempBuffer();
- t.advanceTransition(RawtextEndTagOpen);
- }
- },
- RawtextEndTagOpen {
- // Entered after "</" in Rawtext: a letter starts a candidate end tag, anything else is text.
- void read(Tokeniser t, CharacterReader r) {
- if (!r.matchesLetter()) {
- t.emit("</");
- t.transition(Rawtext);
- return;
- }
- // the letter is not consumed here; RawtextEndTagName reads the whole name
- t.createTagPending(false);
- t.transition(RawtextEndTagName);
- }
- },
- RawtextEndTagName {
- // Collects the name of a candidate end tag inside RAWTEXT. The tag is only accepted when
- // t.isAppropriateEndTagToken() holds; otherwise what was read so far is emitted as text.
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- // fixed locale so that e.g. 'I' always lower-cases to 'i', whatever the JVM default locale is
- t.tagPending.appendTagName(name.toLowerCase(java.util.Locale.ENGLISH));
- t.dataBuffer.append(name); // original case, in case it has to be re-emitted as text
- return;
- }
-
- if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeAttributeName);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- default:
- // c is already consumed, so it goes into the buffer to be emitted along with the name
- t.dataBuffer.append(c);
- anythingElse(t, r);
- }
- } else
- anythingElse(t, r);
- }
-
- // Not an end tag after all: emit "</" plus the buffered characters as text and resume RAWTEXT.
- private void anythingElse(Tokeniser t, CharacterReader r) {
- t.emit("</" + t.dataBuffer.toString());
- t.transition(Rawtext);
- }
- },
- ScriptDataLessthanSign {
- // Entered from '<' in ScriptData: "</" may close the script, "<!" may begin an escaped section.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- if (ch == '/') {
- t.createTempBuffer();
- t.transition(ScriptDataEndTagOpen);
- } else if (ch == '!') {
- t.emit("<!");
- t.transition(ScriptDataEscapeStart);
- } else {
- // plain '<': emit it and put the following character back for ScriptData
- t.emit("<");
- r.unconsume();
- t.transition(ScriptData);
- }
- }
- },
- ScriptDataEndTagOpen {
- // Entered after "</" in ScriptData: a letter starts a candidate end tag, anything else is text.
- void read(Tokeniser t, CharacterReader r) {
- if (!r.matchesLetter()) {
- t.emit("</");
- t.transition(ScriptData);
- return;
- }
- // the letter is not consumed here; ScriptDataEndTagName reads the whole name
- t.createTagPending(false);
- t.transition(ScriptDataEndTagName);
- }
- },
- ScriptDataEndTagName {
- // Collects the name of a candidate end tag inside script data. The tag is only accepted when
- // t.isAppropriateEndTagToken() holds; otherwise what was read so far is emitted as text.
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- // fixed locale so that e.g. 'I' always lower-cases to 'i', whatever the JVM default locale is
- t.tagPending.appendTagName(name.toLowerCase(java.util.Locale.ENGLISH));
- t.dataBuffer.append(name); // original case, in case it has to be re-emitted as text
- return;
- }
-
- if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeAttributeName);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- default:
- // c is already consumed, so it goes into the buffer to be emitted along with the name
- t.dataBuffer.append(c);
- anythingElse(t, r);
- }
- } else {
- anythingElse(t, r);
- }
- }
-
- // Not an end tag after all: emit "</" plus the buffered characters as text and resume script data.
- private void anythingElse(Tokeniser t, CharacterReader r) {
- t.emit("</" + t.dataBuffer.toString());
- t.transition(ScriptData);
- }
- },
- ScriptDataEscapeStart {
- // Entered after "<!" in script data: a '-' continues towards an escaped section.
- void read(Tokeniser t, CharacterReader r) {
- if (!r.matches('-')) {
- t.transition(ScriptData);
- return;
- }
- t.emit('-');
- t.advanceTransition(ScriptDataEscapeStartDash);
- }
- },
- ScriptDataEscapeStartDash {
- // Entered after "<!-" in script data: a second '-' completes the "<!--" opener.
- void read(Tokeniser t, CharacterReader r) {
- if (!r.matches('-')) {
- t.transition(ScriptData);
- return;
- }
- t.emit('-');
- t.advanceTransition(ScriptDataEscapedDashDash);
- }
- },
- ScriptDataEscaped {
- void read(Tokeniser t, CharacterReader r) {
- if (r.isEmpty()) {
- t.eofError(this);
- t.transition(Data);
- return;
- }
-
- switch (r.current()) {
- case '-':
- t.emit('-');
- t.advanceTransition(ScriptDataEscapedDash);
- break;
- case '<':
- t.advanceTransition(ScriptDataEscapedLessthanSign);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- default:
- String data = r.consumeToAny('-', '<', nullChar);
- t.emit(data);
- }
- }
- },
- ScriptDataEscapedDash {
- // Entered after a single '-' in escaped script data.
- void read(Tokeniser t, CharacterReader r) {
- if (r.isEmpty()) {
- t.eofError(this);
- t.transition(Data);
- return;
- }
-
- char ch = r.consume();
- if (ch == '-') {
- t.emit(ch);
- t.transition(ScriptDataEscapedDashDash);
- } else if (ch == '<') {
- t.transition(ScriptDataEscapedLessthanSign);
- } else {
- // every other character goes back to ScriptDataEscaped; NUL is replaced on the way
- if (ch == nullChar) {
- t.error(this);
- t.emit(replacementChar);
- } else {
- t.emit(ch);
- }
- t.transition(ScriptDataEscaped);
- }
- }
- },
- ScriptDataEscapedDashDash {
- // Entered after "--" in escaped script data: a following '>' ends the escaped section.
- void read(Tokeniser t, CharacterReader r) {
- if (r.isEmpty()) {
- t.eofError(this);
- t.transition(Data);
- return;
- }
-
- char ch = r.consume();
- if (ch == '-') {
- t.emit(ch); // further dashes keep us in this state
- } else if (ch == '<') {
- t.transition(ScriptDataEscapedLessthanSign);
- } else if (ch == '>') {
- t.emit(ch);
- t.transition(ScriptData);
- } else {
- // every other character goes back to ScriptDataEscaped; NUL is replaced on the way
- if (ch == nullChar) {
- t.error(this);
- t.emit(replacementChar);
- } else {
- t.emit(ch);
- }
- t.transition(ScriptDataEscaped);
- }
- }
- },
- ScriptDataEscapedLessthanSign {
- // Entered from '<' in escaped script data: a letter may start a nested "<script", '/' may start an end tag.
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- // the buffer collects the lower-cased name for a later comparison; the output keeps the input's case
- t.createTempBuffer();
- t.dataBuffer.append(Character.toLowerCase(r.current()));
- t.emit("<" + r.current());
- t.advanceTransition(ScriptDataDoubleEscapeStart);
- return;
- }
- if (r.matches('/')) {
- t.createTempBuffer();
- t.advanceTransition(ScriptDataEscapedEndTagOpen);
- return;
- }
- t.emit('<');
- t.transition(ScriptDataEscaped);
- }
- },
- ScriptDataEscapedEndTagOpen {
- // Entered after "</" in escaped script data: a letter starts a candidate end tag, anything else is text.
- void read(Tokeniser t, CharacterReader r) {
- if (!r.matchesLetter()) {
- t.emit("</");
- t.transition(ScriptDataEscaped);
- return;
- }
- t.createTagPending(false);
- // tag name gets the lower-cased letter, the temp buffer the letter as written
- t.tagPending.appendTagName(Character.toLowerCase(r.current()));
- t.dataBuffer.append(r.current());
- t.advanceTransition(ScriptDataEscapedEndTagName);
- }
- },
- ScriptDataEscapedEndTagName {
- // Collects the name of a candidate end tag inside escaped script data. The tag is only accepted when
- // t.isAppropriateEndTagToken() holds; otherwise what was read so far is emitted as text.
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- // fixed locale so that e.g. 'I' always lower-cases to 'i', whatever the JVM default locale is
- t.tagPending.appendTagName(name.toLowerCase(java.util.Locale.ENGLISH));
- t.dataBuffer.append(name); // original case, in case it has to be re-emitted as text
- return;
- }
-
- if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeAttributeName);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- default:
- // c is already consumed, so it goes into the buffer to be emitted along with the name
- t.dataBuffer.append(c);
- anythingElse(t, r);
- break;
- }
- } else {
- anythingElse(t, r);
- }
- }
-
- // Not an end tag after all: emit "</" plus the buffered characters as text and resume the escaped state.
- private void anythingElse(Tokeniser t, CharacterReader r) {
- t.emit("</" + t.dataBuffer.toString());
- t.transition(ScriptDataEscaped);
- }
- },
- ScriptDataDoubleEscapeStart {
- // Reads the name that follows '<' inside escaped script data. If the name turns out to be "script",
- // tokenising continues in the double-escaped state, otherwise it returns to the escaped state.
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- // The buffer is compared against "script" below, so lower-casing must not depend on the default
- // locale (under a Turkish locale "SCRIPT" would otherwise become "scr\u0131pt" and never match).
- t.dataBuffer.append(name.toLowerCase(java.util.Locale.ENGLISH));
- t.emit(name); // output keeps the input's case
- return;
- }
-
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- case '/':
- case '>':
- // the name is complete: decide on the next state, then emit the terminating character
- if (t.dataBuffer.toString().equals("script"))
- t.transition(ScriptDataDoubleEscaped);
- else
- t.transition(ScriptDataEscaped);
- t.emit(c);
- break;
- default:
- // not a name terminator: put the character back for the escaped state to handle
- r.unconsume();
- t.transition(ScriptDataEscaped);
- }
- }
- },
- ScriptDataDoubleEscaped {
- void read(Tokeniser t, CharacterReader r) {
- char c = r.current();
- switch (c) {
- case '-':
- t.emit(c);
- t.advanceTransition(ScriptDataDoubleEscapedDash);
- break;
- case '<':
- t.emit(c);
- t.advanceTransition(ScriptDataDoubleEscapedLessthanSign);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- default:
- String data = r.consumeToAny('-', '<', nullChar);
- t.emit(data);
- }
- }
- },
- ScriptDataDoubleEscapedDash {
- // Entered after a single '-' in double-escaped script data.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- if (ch == eof) {
- t.eofError(this);
- t.transition(Data);
- return;
- }
- if (ch == nullChar) {
- t.error(this);
- t.emit(replacementChar);
- t.transition(ScriptDataDoubleEscaped);
- return;
- }
-
- // every remaining character is emitted as-is; only the follow-up state varies
- t.emit(ch);
- if (ch == '-')
- t.transition(ScriptDataDoubleEscapedDashDash);
- else if (ch == '<')
- t.transition(ScriptDataDoubleEscapedLessthanSign);
- else
- t.transition(ScriptDataDoubleEscaped);
- }
- },
- ScriptDataDoubleEscapedDashDash {
- // Entered after "--" in double-escaped script data: a following '>' returns to plain script data.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- if (ch == eof) {
- t.eofError(this);
- t.transition(Data);
- return;
- }
- if (ch == nullChar) {
- t.error(this);
- t.emit(replacementChar);
- t.transition(ScriptDataDoubleEscaped);
- return;
- }
-
- // every remaining character is emitted as-is; only the follow-up state varies
- t.emit(ch);
- switch (ch) {
- case '-':
- break; // further dashes keep us in this state
- case '<':
- t.transition(ScriptDataDoubleEscapedLessthanSign);
- break;
- case '>':
- t.transition(ScriptData);
- break;
- default:
- t.transition(ScriptDataDoubleEscaped);
- }
- }
- },
- ScriptDataDoubleEscapedLessthanSign {
- // Entered from '<' in double-escaped script data: only "</" can lead out of the double escape.
- void read(Tokeniser t, CharacterReader r) {
- if (!r.matches('/')) {
- t.transition(ScriptDataDoubleEscaped);
- return;
- }
- t.emit('/');
- t.createTempBuffer();
- t.advanceTransition(ScriptDataDoubleEscapeEnd);
- }
- },
- ScriptDataDoubleEscapeEnd {
- // Reads the name that follows "</" inside double-escaped script data. If the name turns out to be
- // "script", tokenising drops back to the (single) escaped state, otherwise it stays double-escaped.
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- // The buffer is compared against "script" below, so lower-casing must not depend on the default
- // locale (under a Turkish locale "SCRIPT" would otherwise become "scr\u0131pt" and never match).
- t.dataBuffer.append(name.toLowerCase(java.util.Locale.ENGLISH));
- t.emit(name); // output keeps the input's case
- return;
- }
-
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- case '/':
- case '>':
- // the name is complete: decide on the next state, then emit the terminating character
- if (t.dataBuffer.toString().equals("script"))
- t.transition(ScriptDataEscaped);
- else
- t.transition(ScriptDataDoubleEscaped);
- t.emit(c);
- break;
- default:
- // not a name terminator: put the character back for the double-escaped state to handle
- r.unconsume();
- t.transition(ScriptDataDoubleEscaped);
- }
- }
- },
- BeforeAttributeName {
- // from tagname <xxx
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- switch (ch) {
- case ' ':
- case '\t':
- case '\n':
- case '\f':
- break; // whitespace between attributes is skipped
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- case '"':
- case '\'':
- case '<':
- case '=':
- // invalid first character, but it still becomes the start of the new attribute's name
- t.error(this);
- t.tagPending.newAttribute();
- t.tagPending.appendAttributeName(ch);
- t.transition(AttributeName);
- break;
- default: // NUL, A-Z, anything else
- // NUL is an error; either way the character is put back for AttributeName to read
- if (ch == nullChar)
- t.error(this);
- t.tagPending.newAttribute();
- r.unconsume();
- t.transition(AttributeName);
- }
- }
- },
- AttributeName {
- // from before attribute name
- void read(Tokeniser t, CharacterReader r) {
- String name = r.consumeToAny('\t', '\n', '\f', ' ', '/', '=', '>', nullChar, '"', '\'', '<');
- // Lower-case with a fixed locale: the default-locale toLowerCase() maps 'I' to a dotless i under
- // Turkish/Azeri locales, which would corrupt names such as ID.
- t.tagPending.appendAttributeName(name.toLowerCase(java.util.Locale.ENGLISH));
-
- // the character that stopped the run above decides what follows
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(AfterAttributeName);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '=':
- t.transition(BeforeAttributeValue);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- case nullChar:
- // stays in this state; the name continues after the replacement character
- t.error(this);
- t.tagPending.appendAttributeName(replacementChar);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- case '"':
- case '\'':
- case '<':
- // an error, but the character is kept as part of the name and the state does not change
- t.error(this);
- t.tagPending.appendAttributeName(c);
- // no default, as covered in consumeToAny
- }
- }
- },
- AfterAttributeName {
- // Entered after whitespace that followed an attribute name: the next character either gives that
- // attribute a value ('='), ends the tag, or starts another attribute.
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- // ignore
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '=':
- t.transition(BeforeAttributeValue);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- case nullChar:
- t.error(this);
- // As in the other name-starting cases below, a new attribute has to be started first; without
- // this the replacement character would be appended to the previous attribute's name.
- t.tagPending.newAttribute();
- t.tagPending.appendAttributeName(replacementChar);
- t.transition(AttributeName);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- case '"':
- case '\'':
- case '<':
- // invalid first character, but it still becomes the start of the new attribute's name
- t.error(this);
- t.tagPending.newAttribute();
- t.tagPending.appendAttributeName(c);
- t.transition(AttributeName);
- break;
- default: // A-Z, anything else
- // start a new attribute and put the character back for AttributeName to read
- t.tagPending.newAttribute();
- r.unconsume();
- t.transition(AttributeName);
- }
- }
- },
- BeforeAttributeValue {
- // Entered after '=': picks the quoted or unquoted value state based on the first non-whitespace character.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- switch (ch) {
- case ' ':
- case '\t':
- case '\n':
- case '\f':
- break; // leading whitespace is skipped
- case '"':
- t.transition(AttributeValue_doubleQuoted);
- break;
- case '\'':
- t.transition(AttributeValue_singleQuoted);
- break;
- case '>':
- // tag closed before any value was given
- t.error(this);
- t.emitTagPending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- case nullChar:
- t.error(this);
- t.tagPending.appendAttributeValue(replacementChar);
- t.transition(AttributeValue_unquoted);
- break;
- case '<':
- case '=':
- case '`':
- // an error, but the character is kept as the first character of an unquoted value
- t.error(this);
- t.tagPending.appendAttributeValue(ch);
- t.transition(AttributeValue_unquoted);
- break;
- default: // '&' and anything else
- // put the character back so the unquoted state reads it itself
- r.unconsume();
- t.transition(AttributeValue_unquoted);
- }
- }
- },
- AttributeValue_doubleQuoted {
- // Inside a "..." attribute value: plain runs are appended in bulk, then the stopping character is handled.
- void read(Tokeniser t, CharacterReader r) {
- String run = r.consumeToAny('"', '&', nullChar);
- if (run.length() != 0)
- t.tagPending.appendAttributeValue(run);
-
- char ch = r.consume();
- if (ch == '"') {
- t.transition(AfterAttributeValue_quoted);
- } else if (ch == '&') {
- // append what consumeCharacterReference returned, or a literal '&' when it returned null
- Character ref = t.consumeCharacterReference('"', true);
- if (ref == null)
- t.tagPending.appendAttributeValue('&');
- else
- t.tagPending.appendAttributeValue(ref);
- } else if (ch == nullChar) {
- t.error(this);
- t.tagPending.appendAttributeValue(replacementChar);
- } else if (ch == eof) {
- t.eofError(this);
- t.transition(Data);
- }
- // nothing else can occur here: consumeToAny above only stops at the characters handled
- }
- },
- AttributeValue_singleQuoted {
- // Inside a '...' attribute value: plain runs are appended in bulk, then the stopping character is handled.
- void read(Tokeniser t, CharacterReader r) {
- String run = r.consumeToAny('\'', '&', nullChar);
- if (run.length() != 0)
- t.tagPending.appendAttributeValue(run);
-
- char ch = r.consume();
- if (ch == '\'') {
- t.transition(AfterAttributeValue_quoted);
- } else if (ch == '&') {
- // append what consumeCharacterReference returned, or a literal '&' when it returned null
- Character ref = t.consumeCharacterReference('\'', true);
- if (ref == null)
- t.tagPending.appendAttributeValue('&');
- else
- t.tagPending.appendAttributeValue(ref);
- } else if (ch == nullChar) {
- t.error(this);
- t.tagPending.appendAttributeValue(replacementChar);
- } else if (ch == eof) {
- t.eofError(this);
- t.transition(Data);
- }
- // nothing else can occur here: consumeToAny above only stops at the characters handled
- }
- },
- AttributeValue_unquoted {
- // Inside an unquoted attribute value: plain runs are appended in bulk, then the stopping character is
- // handled. Whitespace or '>' ends the value.
- void read(Tokeniser t, CharacterReader r) {
- String run = r.consumeToAny('\t', '\n', '\f', ' ', '&', '>', nullChar, '"', '\'', '<', '=', '`');
- if (run.length() != 0)
- t.tagPending.appendAttributeValue(run);
-
- char ch = r.consume();
- switch (ch) {
- case ' ':
- case '\t':
- case '\n':
- case '\f':
- t.transition(BeforeAttributeName);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- case '&':
- // append what consumeCharacterReference returned, or a literal '&' when it returned null
- Character ref = t.consumeCharacterReference('>', true);
- if (ref == null)
- t.tagPending.appendAttributeValue('&');
- else
- t.tagPending.appendAttributeValue(ref);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- case nullChar:
- t.error(this);
- t.tagPending.appendAttributeValue(replacementChar);
- break;
- case '"':
- case '\'':
- case '<':
- case '=':
- case '`':
- // an error, but the character is kept as part of the value
- t.error(this);
- t.tagPending.appendAttributeValue(ch);
- break;
- // no default: consumeToAny above only stops at the characters handled
- }
- }
- },
- // CharacterReferenceInAttributeValue state handled inline
- AfterAttributeValue_quoted {
- // Entered after the closing quote of an attribute value.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- if (ch == '\t' || ch == '\n' || ch == '\f' || ch == ' ') {
- t.transition(BeforeAttributeName);
- } else if (ch == '/') {
- t.transition(SelfClosingStartTag);
- } else if (ch == '>') {
- t.emitTagPending();
- t.transition(Data);
- } else if (ch == eof) {
- t.eofError(this);
- t.transition(Data);
- } else {
- // missing whitespace before the next attribute: report it and re-read the character as a name
- t.error(this);
- r.unconsume();
- t.transition(BeforeAttributeName);
- }
- }
- },
- SelfClosingStartTag {
- // Entered after '/' inside a tag: only an immediate '>' makes the tag self-closing.
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '>':
- t.tagPending.selfClosing = true;
- t.emitTagPending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- default:
- t.error(this);
- // The '/' was stray; the character after it belongs to the tag and must be re-read. Without the
- // unconsume it is silently dropped, e.g. the attribute in <a/b> loses its name.
- r.unconsume();
- t.transition(BeforeAttributeName);
- }
- }
- },
- BogusComment {
- // Turns everything up to the next '>' into a comment token and returns to Data.
- void read(Tokeniser t, CharacterReader r) {
- // todo: handle bogus comment starting from eof. when does that trigger?
- // rewind to capture character that lead us here
- r.unconsume();
- Token.Comment comment = new Token.Comment();
- // NUL is not passed through into comment data; it is swapped for the replacement character, as the
- // other comment states do
- comment.data.append(r.consumeTo('>').replace(nullChar, replacementChar));
- t.emit(comment);
- t.advanceTransition(Data);
- }
- },
- MarkupDeclarationOpen {
- // Entered after "<!": recognises comments, doctypes and CDATA sections; anything else is a bogus comment.
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchConsume("--")) {
- t.createCommentPending();
- t.transition(CommentStart);
- return;
- }
- if (r.matchConsumeIgnoreCase("DOCTYPE")) {
- t.transition(Doctype);
- return;
- }
- if (r.matchConsume("[CDATA[")) {
- // todo: should actually check the current namespace, as only non-html content allows cdata. until
- // namespaces are implemented properly, keep handling as cdata
- //} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) {
- t.transition(CdataSection);
- return;
- }
- t.error(this);
- t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
- }
- },
- CommentStart {
- // Entered right after "<!--".
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- switch (ch) {
- case '-':
- t.transition(CommentStartDash);
- break;
- case '>':
- case eof:
- // comment ends before it really began; only the kind of error reported differs
- if (ch == eof)
- t.eofError(this);
- else
- t.error(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- default:
- // first character of the comment body; NUL is replaced
- if (ch == nullChar) {
- t.error(this);
- t.commentPending.data.append(replacementChar);
- } else {
- t.commentPending.data.append(ch);
- }
- t.transition(Comment);
- }
- }
- },
- CommentStartDash {
- // Entered after "<!--" plus one further '-'. That dash has not been added to the comment data yet.
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '-':
- // two dashes in a row: this may be the closing "--" (as in <!---->), so CommentEnd decides
- t.transition(CommentEnd);
- break;
- case nullChar:
- t.error(this);
- // the earlier dash turned out to be content, so it is restored ahead of the replacement char
- t.commentPending.data.append('-').append(replacementChar);
- t.transition(Comment);
- break;
- case '>':
- t.error(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- default:
- // the earlier dash turned out to be content, so it is restored ahead of the current character
- t.commentPending.data.append('-').append(c);
- t.transition(Comment);
- }
- }
- },
- Comment {
- void read(Tokeniser t, CharacterReader r) {
- char c = r.current();
- switch (c) {
- case '-':
- t.advanceTransition(CommentEndDash);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.commentPending.data.append(replacementChar);
- break;
- case eof:
- t.eofError(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- default:
- t.commentPending.data.append(r.consumeToAny('-', nullChar));
- }
- }
- },
- CommentEndDash {
- // Entered after a single '-' inside a comment; that dash is not yet part of the comment data.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- if (ch == '-') {
- t.transition(CommentEnd);
- return;
- }
- if (ch == eof) {
- t.eofError(this);
- t.emitCommentPending();
- t.transition(Data);
- return;
- }
-
- // the dash was ordinary content: restore it, followed by the current character (NUL is replaced)
- if (ch == nullChar) {
- t.error(this);
- t.commentPending.data.append('-').append(replacementChar);
- } else {
- t.commentPending.data.append('-').append(ch);
- }
- t.transition(Comment);
- }
- },
- CommentEnd {
- // Entered after "--" inside a comment; those dashes are not yet part of the comment data.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- switch (ch) {
- case '>':
- t.emitCommentPending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- case '!':
- t.error(this);
- t.transition(CommentEndBang);
- break;
- case '-':
- // a third dash: one dash becomes content, and the state is kept
- t.error(this);
- t.commentPending.data.append('-');
- break;
- default:
- // the "--" was content after all: restore it, followed by the current character (NUL is replaced)
- t.error(this);
- if (ch == nullChar)
- t.commentPending.data.append("--").append(replacementChar);
- else
- t.commentPending.data.append("--").append(ch);
- t.transition(Comment);
- }
- }
- },
- CommentEndBang {
- // Entered after "--!" inside a comment; those characters are not yet part of the comment data.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- switch (ch) {
- case '>':
- t.emitCommentPending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- case '-':
- // "--!" was content; the new dash may start another attempt at ending the comment
- t.commentPending.data.append("--!");
- t.transition(CommentEndDash);
- break;
- default:
- // "--!" was content: restore it, followed by the current character (NUL is replaced)
- if (ch == nullChar) {
- t.error(this);
- t.commentPending.data.append("--!").append(replacementChar);
- } else {
- t.commentPending.data.append("--!").append(ch);
- }
- t.transition(Comment);
- }
- }
- },
- Doctype {
- // Entered after "<!DOCTYPE" (matched case-insensitively by MarkupDeclarationOpen).
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- if (ch == eof) {
- // nothing follows the keyword: emit an empty doctype flagged as force-quirks
- t.eofError(this);
- t.createDoctypePending();
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- return;
- }
-
- boolean whitespace = ch == '\t' || ch == '\n' || ch == '\f' || ch == ' ';
- if (!whitespace)
- t.error(this); // whitespace is expected between the keyword and the name
- t.transition(BeforeDoctypeName);
- }
- },
- BeforeDoctypeName {
- // Skips whitespace ahead of the doctype name and creates the pending doctype token once the name begins.
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- // the letter is left in place for DoctypeName to consume
- t.createDoctypePending();
- t.transition(DoctypeName);
- return;
- }
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- break; // ignore whitespace
- case nullChar:
- t.error(this);
- // like every other branch that starts the name, the token has to be created before it is written
- // to; this branch previously used t.doctypePending without creating it
- t.createDoctypePending();
- t.doctypePending.name.append(replacementChar);
- t.transition(DoctypeName);
- break;
- case eof:
- t.eofError(this);
- t.createDoctypePending();
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.createDoctypePending();
- t.doctypePending.name.append(c);
- t.transition(DoctypeName);
- }
- }
- },
- DoctypeName {
- // Accumulates the doctype name (lower-cased) until whitespace, '>' or end of input.
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- // fixed locale so that e.g. 'I' always lower-cases to 'i', whatever the JVM default locale is
- t.doctypePending.name.append(name.toLowerCase(java.util.Locale.ENGLISH));
- return;
- }
- char c = r.consume();
- switch (c) {
- case '>':
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(AfterDoctypeName);
- break;
- case nullChar:
- // stays in this state; the name continues after the replacement character
- t.error(this);
- t.doctypePending.name.append(replacementChar);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- // any other non-letter character is taken into the name as-is
- t.doctypePending.name.append(c);
- }
- }
- },
- AfterDoctypeName {
- // Entered after the doctype name: looks for '>', or the PUBLIC / SYSTEM keywords (case-insensitive).
- void read(Tokeniser t, CharacterReader r) {
- if (r.isEmpty()) {
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- return;
- }
- if (r.matchesAny('\t', '\n', '\f', ' ')) {
- r.advance(); // ignore whitespace
- return;
- }
- if (r.matches('>')) {
- t.emitDoctypePending();
- t.advanceTransition(Data);
- return;
- }
- if (r.matchConsumeIgnoreCase("PUBLIC")) {
- t.transition(AfterDoctypePublicKeyword);
- return;
- }
- if (r.matchConsumeIgnoreCase("SYSTEM")) {
- t.transition(AfterDoctypeSystemKeyword);
- return;
- }
- // anything else makes the rest of the doctype bogus
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.advanceTransition(BogusDoctype);
- }
- },
- AfterDoctypePublicKeyword {
- // Entered after the PUBLIC keyword: whitespace is the only input that is not reported as an error.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- if (ch == '\t' || ch == '\n' || ch == '\f' || ch == ' ') {
- t.transition(BeforeDoctypePublicIdentifier);
- return;
- }
-
- // every other input is an error; end of input is reported through eofError
- if (ch == eof)
- t.eofError(this);
- else
- t.error(this);
-
- switch (ch) {
- case '"':
- // set public id to empty string
- t.transition(DoctypePublicIdentifier_doubleQuoted);
- break;
- case '\'':
- // set public id to empty string
- t.transition(DoctypePublicIdentifier_singleQuoted);
- break;
- case '>':
- case eof:
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.doctypePending.forceQuirks = true;
- t.transition(BogusDoctype);
- }
- }
- },
- BeforeDoctypePublicIdentifier {
- // Skips whitespace after the PUBLIC keyword and expects the opening quote of the public identifier.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- switch (ch) {
- case ' ':
- case '\t':
- case '\n':
- case '\f':
- break; // whitespace is skipped
- case '"':
- // set public id to empty string
- t.transition(DoctypePublicIdentifier_doubleQuoted);
- break;
- case '\'':
- // set public id to empty string
- t.transition(DoctypePublicIdentifier_singleQuoted);
- break;
- case '>':
- case eof:
- // doctype ended without an identifier; only the kind of error reported differs
- if (ch == eof)
- t.eofError(this);
- else
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.transition(BogusDoctype);
- }
- }
- },
- DoctypePublicIdentifier_doubleQuoted {
- // Inside a "..." public identifier: characters are collected one at a time until the closing quote.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- switch (ch) {
- case '"':
- t.transition(AfterDoctypePublicIdentifier);
- break;
- case '>':
- case eof:
- // identifier was never closed; only the kind of error reported differs
- if (ch == eof)
- t.eofError(this);
- else
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case nullChar:
- t.error(this);
- t.doctypePending.publicIdentifier.append(replacementChar);
- break;
- default:
- t.doctypePending.publicIdentifier.append(ch);
- }
- }
- },
- DoctypePublicIdentifier_singleQuoted {
- // Inside a '...' public identifier: characters are collected one at a time until the closing quote.
- void read(Tokeniser t, CharacterReader r) {
- char ch = r.consume();
- switch (ch) {
- case '\'':
- t.transition(AfterDoctypePublicIdentifier);
- break;
- case '>':
- case eof:
- // identifier was never closed; only the kind of error reported differs
- if (ch == eof)
- t.eofError(this);
- else
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case nullChar:
- t.error(this);
- t.doctypePending.publicIdentifier.append(replacementChar);
- break;
- default:
- t.doctypePending.publicIdentifier.append(ch);
- }
- }
- },
- AfterDoctypePublicIdentifier {
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BetweenDoctypePublicAndSystemIdentifiers);
- break;
- case '>':
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case '"':
- t.error(this);
- // system id empty
- t.transition(DoctypeSystemIdentifier_doubleQuoted);
- break;
- case '\'':
- t.error(this);
- // system id empty
- t.transition(DoctypeSystemIdentifier_singleQuoted);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.transition(BogusDoctype);
- }
- }
- },
- BetweenDoctypePublicAndSystemIdentifiers {
-     /**
-      * Handles characters between a closed public identifier and the system identifier,
-      * once at least one whitespace character has been seen (this state is entered from
-      * AfterDoctypePublicIdentifier on whitespace).
-      *
-      * @param t tokeniser holding the pending doctype and the current state
-      * @param r reader the next character is consumed from
-      */
-     void read(Tokeniser t, CharacterReader r) {
-         char c = r.consume();
-         switch (c) {
-             case '\t':
-             case '\n':
-             case '\f':
-             case ' ':
-                 // further whitespace: stay in this state
-                 break;
-             case '>':
-                 t.emitDoctypePending();
-                 t.transition(Data);
-                 break;
-             case '"':
-                 // Whitespace already separates the two identifiers, so an opening quote is
-                 // well-formed here and is not reported as an error. System id empty.
-                 t.transition(DoctypeSystemIdentifier_doubleQuoted);
-                 break;
-             case '\'':
-                 // Same as the double-quote case: well-formed, no error. System id empty.
-                 t.transition(DoctypeSystemIdentifier_singleQuoted);
-                 break;
-             case eof:
-                 t.eofError(this);
-                 t.doctypePending.forceQuirks = true;
-                 t.emitDoctypePending();
-                 t.transition(Data);
-                 break;
-             default:
-                 t.error(this);
-                 t.doctypePending.forceQuirks = true;
-                 t.transition(BogusDoctype);
-         }
-     }
- },
- AfterDoctypeSystemKeyword {
-     /**
-      * Handles the character consumed right after the doctype system keyword.
-      * Whitespace moves on to BeforeDoctypeSystemIdentifier; a quote without separating
-      * whitespace is reported and opens the system identifier; '>' or end of input emits
-      * the pending doctype with forceQuirks set; anything else goes to BogusDoctype.
-      *
-      * @param t tokeniser holding the pending doctype and the current state
-      * @param r reader the next character is consumed from
-      */
-     void read(Tokeniser t, CharacterReader r) {
-         char c = r.consume();
-         switch (c) {
-             case '\t':
-             case '\n':
-             case '\f':
-             case ' ':
-                 t.transition(BeforeDoctypeSystemIdentifier);
-                 break;
-             case '>':
-                 t.error(this);
-                 t.doctypePending.forceQuirks = true;
-                 t.emitDoctypePending();
-                 t.transition(Data);
-                 break;
-             case '"':
-                 t.error(this);
-                 // system id empty
-                 t.transition(DoctypeSystemIdentifier_doubleQuoted);
-                 break;
-             case '\'':
-                 t.error(this);
-                 // system id empty
-                 t.transition(DoctypeSystemIdentifier_singleQuoted);
-                 break;
-             case eof:
-                 t.eofError(this);
-                 t.doctypePending.forceQuirks = true;
-                 t.emitDoctypePending();
-                 t.transition(Data);
-                 break;
-             default:
-                 // Unexpected character: set forceQuirks and hand the rest of the declaration
-                 // to BogusDoctype, which emits the doctype at '>' or end of input. Emitting
-                 // here without a transition would leave the tokeniser in this state, where
-                 // later characters could emit the same doctype again.
-                 t.error(this);
-                 t.doctypePending.forceQuirks = true;
-                 t.transition(BogusDoctype);
-         }
-     }
- },
- BeforeDoctypeSystemIdentifier {
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- break;
- case '"':
- // set system id to empty string
- t.transition(DoctypeSystemIdentifier_doubleQuoted);
- break;
- case '\'':
- // set public id to empty string
- t.transition(DoctypeSystemIdentifier_singleQuoted);
- break;
- case '>':
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.transition(BogusDoctype);
- }
- }
- },
- DoctypeSystemIdentifier_doubleQuoted {
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '"':
- t.transition(AfterDoctypeSystemIdentifier);
- break;
- case nullChar:
- t.error(this);
- t.doctypePending.systemIdentifier.append(replacementChar);
- break;
- case '>':
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.doctypePending.systemIdentifier.append(c);
- }
- }
- },
- DoctypeSystemIdentifier_singleQuoted {
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\'':
- t.transition(AfterDoctypeSystemIdentifier);
- break;
- case nullChar:
- t.error(this);
- t.doctypePending.systemIdentifier.append(replacementChar);
- break;
- case '>':
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.doctypePending.systemIdentifier.append(c);
- }
- }
- },
- AfterDoctypeSystemIdentifier {
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- break;
- case '>':
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.transition(BogusDoctype);
- // NOT force quirks
- }
- }
- },
- BogusDoctype {
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '>':
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- // ignore char
- break;
- }
- }
- },
- CdataSection {
-     // Emits whatever consumeTo returns for the "]]>" terminator, asks the reader to
-     // match-and-consume that terminator, then goes back to Data.
-     void read(Tokeniser t, CharacterReader r) {
-         t.emit(r.consumeTo("]]>"));
-         r.matchConsume("]]>");
-         t.transition(Data);
-     }
- };
-
-
- abstract void read(Tokeniser t, CharacterReader r); // implemented by every state constant: reads from r and drives t (emit / error / transition)
-
- private static final char nullChar = '\u0000'; // U+0000; used as a case label by states that special-case null input
- private static final char replacementChar = Tokeniser.replacementChar; // emitted or appended in place of a null character by several states
- private static final String replacementStr = String.valueOf(Tokeniser.replacementChar); // String form of Tokeniser.replacementChar
- private static final char eof = CharacterReader.EOF; // value the states compare a read character against to detect end of input
-}