1 files changed, 230 insertions, 0 deletions
diff --git a/server/src/org/jsoup/parser/Tokeniser.java b/server/src/org/jsoup/parser/Tokeniser.java
new file mode 100644
index 0000000000..ce6ee690d6
--- /dev/null
+++ b/server/src/org/jsoup/parser/Tokeniser.java
@@ -0,0 +1,230 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.Entities;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Readers the input stream into tokens.
+ */
+class Tokeniser {
+    static final char replacementChar = '\uFFFD'; // replaces null character
+
+    private CharacterReader reader; // html input
+    private ParseErrorList errors; // errors found while tokenising
+
+    private TokeniserState state = TokeniserState.Data; // current tokenisation state
+    private Token emitPending; // the token we are about to emit on next read
+    private boolean isEmitPending = false;
+    private StringBuilder charBuffer = new StringBuilder(); // buffers characters to output as one token
+    StringBuilder dataBuffer; // buffers data looking for </script>
+
+    Token.Tag tagPending; // tag we are building up
+    Token.Doctype doctypePending; // doctype building up
+    Token.Comment commentPending; // comment building up
+    private Token.StartTag lastStartTag; // the last start tag emitted, to test appropriate end tag
+    private boolean selfClosingFlagAcknowledged = true;
+
+    Tokeniser(CharacterReader reader, ParseErrorList errors) {
+        this.reader = reader;
+        this.errors = errors;
+    }
+
+    Token read() {
+        if (!selfClosingFlagAcknowledged) {
+            error("Self closing flag not acknowledged");
+            selfClosingFlagAcknowledged = true;
+        }
+
+        while (!isEmitPending)
+            state.read(this, reader);
+
+        // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read:
+        if (charBuffer.length() > 0) {
+            String str = charBuffer.toString();
+            charBuffer.delete(0, charBuffer.length());
+            return new Token.Character(str);
+        } else {
+            isEmitPending = false;
+            return emitPending;
+        }
+    }
+
+    void emit(Token token) {
+        Validate.isFalse(isEmitPending, "There is an unread token pending!");
+
+        emitPending = token;
+        isEmitPending = true;
+
+        if (token.type == Token.TokenType.StartTag) {
+            Token.StartTag startTag = (Token.StartTag) token;
+            lastStartTag = startTag;
+            if (startTag.selfClosing)
+                selfClosingFlagAcknowledged = false;
+        } else if (token.type == Token.TokenType.EndTag) {
+            Token.EndTag endTag = (Token.EndTag) token;
+            if (endTag.attributes.size() > 0)
+                error("Attributes incorrectly present on end tag");
+        }
+    }
+
+    void emit(String str) {
+        // buffer strings up until last string token found, to emit only one token for a run of character refs etc.
+        // does not set isEmitPending; read checks that
+        charBuffer.append(str);
+    }
+
+    void emit(char c) {
+        charBuffer.append(c);
+    }
+
+    TokeniserState getState() {
+        return state;
+    }
+
+    void transition(TokeniserState state) {
+        this.state = state;
+    }
+
+    void advanceTransition(TokeniserState state) {
+        reader.advance();
+        this.state = state;
+    }
+
+    void acknowledgeSelfClosingFlag() {
+        selfClosingFlagAcknowledged = true;
+    }
+
+    Character consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
+        if (reader.isEmpty())
+            return null;
+        if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
+            return null;
+        if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&'))
+            return null;
+
+        reader.mark();
+        if (reader.matchConsume("#")) { // numbered
+            boolean isHexMode = reader.matchConsumeIgnoreCase("X");
+            String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
+            if (numRef.length() == 0) { // didn't match anything
+                characterReferenceError("numeric reference with no numerals");
+                reader.rewindToMark();
+                return null;
+            }
+            if (!reader.matchConsume(";"))
+                characterReferenceError("missing semicolon"); // missing semi
+            int charval = -1;
+            try {
+                int base = isHexMode ? 16 : 10;
+                charval = Integer.valueOf(numRef, base);
+            } catch (NumberFormatException e) {
+            } // skip
+            if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
+                characterReferenceError("character outside of valid range");
+                return replacementChar;
+            } else {
+                // todo: implement number replacement table
+                // todo: check for extra illegal unicode points as parse errors
+                return (char) charval;
+            }
+        } else { // named
+            // get as many letters as possible, and look for matching entities. unconsume backwards till a match is found
+            String nameRef = reader.consumeLetterThenDigitSequence();
+            String origNameRef = new String(nameRef); // for error reporting. nameRef gets chomped looking for matches
+            boolean looksLegit = reader.matches(';');
+            boolean found = false;
+            while (nameRef.length() > 0 && !found) {
+                if (Entities.isNamedEntity(nameRef))
+                    found = true;
+                else {
+                    nameRef = nameRef.substring(0, nameRef.length()-1);
+                    reader.unconsume();
+                }
+            }
+            if (!found) {
+                if (looksLegit) // named with semicolon
+                    characterReferenceError(String.format("invalid named referenece '%s'", origNameRef));
+                reader.rewindToMark();
+                return null;
+            }
+            if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
+                // don't want that to match
+                reader.rewindToMark();
+                return null;
+            }
+            if (!reader.matchConsume(";"))
+                characterReferenceError("missing semicolon"); // missing semi
+            return Entities.getCharacterByName(nameRef);
+        }
+    }
+
+    Token.Tag createTagPending(boolean start) {
+        tagPending = start ? new Token.StartTag() : new Token.EndTag();
+        return tagPending;
+    }
+
+    void emitTagPending() {
+        tagPending.finaliseTag();
+        emit(tagPending);
+    }
+
+    void createCommentPending() {
+        commentPending = new Token.Comment();
+    }
+
+    void emitCommentPending() {
+        emit(commentPending);
+    }
+
+    void createDoctypePending() {
+        doctypePending = new Token.Doctype();
+    }
+
+    void emitDoctypePending() {
+        emit(doctypePending);
+    }
+
+    void createTempBuffer() {
+        dataBuffer = new StringBuilder();
+    }
+
+    boolean isAppropriateEndTagToken() {
+        if (lastStartTag == null)
+            return false;
+        return tagPending.tagName.equals(lastStartTag.tagName);
+    }
+
+    String appropriateEndTagName() {
+        return lastStartTag.tagName;
+    }
+
+    void error(TokeniserState state) {
+        if (errors.canAddError())
+            errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state));
+    }
+
+    void eofError(TokeniserState state) {
+        if (errors.canAddError())
+            errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state));
+    }
+
+    private void characterReferenceError(String message) {
+        if (errors.canAddError())
+            errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message));
+    }
+
+    private void error(String errorMsg) {
+        if (errors.canAddError())
+            errors.add(new ParseError(reader.pos(), errorMsg));
+    }
+
+    boolean currentNodeInHtmlNS() {
+        // todo: implement namespaces correctly
+        return true;
+        // Element currentNode = currentNode();
+        // return currentNode != null && currentNode.namespace().equals("HTML");
+    }
+}