1 files changed, 350 insertions, 0 deletions
diff --git a/server/src/org/jsoup/nodes/Document.java b/server/src/org/jsoup/nodes/Document.java
new file mode 100644
index 0000000000..adb371ce14
--- /dev/null
+++ b/server/src/org/jsoup/nodes/Document.java
@@ -0,0 +1,350 @@
+package org.jsoup.nodes;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.parser.Tag;
+import org.jsoup.select.Elements;
+
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ A HTML Document.
+
+ @author Jonathan Hedley, jonathan@hedley.net */
+public class Document extends Element {
+    private OutputSettings outputSettings = new OutputSettings();
+    private QuirksMode quirksMode = QuirksMode.noQuirks;
+
+    /**
+     Create a new, empty Document.
+     @param baseUri base URI of document
+     @see org.jsoup.Jsoup#parse
+     @see #createShell
+     */
+    public Document(String baseUri) {
+        super(Tag.valueOf("#root"), baseUri);
+    }
+
+    /**
+     Create a valid, empty shell of a document, suitable for adding more elements to.
+     @param baseUri baseUri of document
+     @return document with html, head, and body elements.
+     */
+    static public Document createShell(String baseUri) {
+        Validate.notNull(baseUri);
+
+        Document doc = new Document(baseUri);
+        Element html = doc.appendElement("html");
+        html.appendElement("head");
+        html.appendElement("body");
+
+        return doc;
+    }
+
+    /**
+     Accessor to the document's {@code head} element.
+     @return {@code head}
+     */
+    public Element head() {
+        return findFirstElementByTagName("head", this);
+    }
+
+    /**
+     Accessor to the document's {@code body} element.
+     @return {@code body}
+     */
+    public Element body() {
+        return findFirstElementByTagName("body", this);
+    }
+
+    /**
+     Get the string contents of the document's {@code title} element.
+     @return Trimmed title, or empty string if none set.
+     */
+    public String title() {
+        Element titleEl = getElementsByTag("title").first();
+        return titleEl != null ? titleEl.text().trim() : "";
+    }
+
+    /**
+     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
+     not present
+     @param title string to set as title
+     */
+    public void title(String title) {
+        Validate.notNull(title);
+        Element titleEl = getElementsByTag("title").first();
+        if (titleEl == null) { // add to head
+            head().appendElement("title").text(title);
+        } else {
+            titleEl.text(title);
+        }
+    }
+
+    /**
+     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
+     @param tagName element tag name (e.g. {@code a})
+     @return new element
+     */
+    public Element createElement(String tagName) {
+        return new Element(Tag.valueOf(tagName), this.baseUri());
+    }
+
+    /**
+     Normalise the document. This happens after the parse phase so generally does not need to be called.
+     Moves any text content that is not in the body element into the body.
+     @return this document after normalisation
+     */
+    public Document normalise() {
+        Element htmlEl = findFirstElementByTagName("html", this);
+        if (htmlEl == null)
+            htmlEl = appendElement("html");
+        if (head() == null)
+            htmlEl.prependElement("head");
+        if (body() == null)
+            htmlEl.appendElement("body");
+
+        // pull text nodes out of root, html, and head els, and push into body. non-text nodes are already taken care
+        // of. do in inverse order to maintain text order.
+        normaliseTextNodes(head());
+        normaliseTextNodes(htmlEl);
+        normaliseTextNodes(this);
+
+        normaliseStructure("head", htmlEl);
+        normaliseStructure("body", htmlEl);
+        
+        return this;
+    }
+
+    // does not recurse.
+    private void normaliseTextNodes(Element element) {
+        List<Node> toMove = new ArrayList<Node>();
+        for (Node node: element.childNodes) {
+            if (node instanceof TextNode) {
+                TextNode tn = (TextNode) node;
+                if (!tn.isBlank())
+                    toMove.add(tn);
+            }
+        }
+
+        for (int i = toMove.size()-1; i >= 0; i--) {
+            Node node = toMove.get(i);
+            element.removeChild(node);
+            body().prependChild(new TextNode(" ", ""));
+            body().prependChild(node);
+        }
+    }
+
+    // merge multiple <head> or <body> contents into one, delete the remainder, and ensure they are owned by <html>
+    private void normaliseStructure(String tag, Element htmlEl) {
+        Elements elements = this.getElementsByTag(tag);
+        Element master = elements.first(); // will always be available as created above if not existent
+        if (elements.size() > 1) { // dupes, move contents to master
+            List<Node> toMove = new ArrayList<Node>();
+            for (int i = 1; i < elements.size(); i++) {
+                Node dupe = elements.get(i);
+                for (Node node : dupe.childNodes)
+                    toMove.add(node);
+                dupe.remove();
+            }
+
+            for (Node dupe : toMove)
+                master.appendChild(dupe);
+        }
+        // ensure parented by <html>
+        if (!master.parent().equals(htmlEl)) {
+            htmlEl.appendChild(master); // includes remove()            
+        }
+    }
+
+    // fast method to get first by tag name, used for html, head, body finders
+    private Element findFirstElementByTagName(String tag, Node node) {
+        if (node.nodeName().equals(tag))
+            return (Element) node;
+        else {
+            for (Node child: node.childNodes) {
+                Element found = findFirstElementByTagName(tag, child);
+                if (found != null)
+                    return found;
+            }
+        }
+        return null;
+    }
+
+    @Override
+    public String outerHtml() {
+        return super.html(); // no outer wrapper tag
+    }
+
+    /**
+     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
+     @param text unencoded text
+     @return this document
+     */
+    @Override
+    public Element text(String text) {
+        body().text(text); // overridden to not nuke doc structure
+        return this;
+    }
+
+    @Override
+    public String nodeName() {
+        return "#document";
+    }
+
+    @Override
+    public Document clone() {
+        Document clone = (Document) super.clone();
+        clone.outputSettings = this.outputSettings.clone();
+        return clone;
+    }
+
+    /**
+     * A Document's output settings control the form of the text() and html() methods.
+     */
+    public static class OutputSettings implements Cloneable {
+        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
+        private Charset charset = Charset.forName("UTF-8");
+        private CharsetEncoder charsetEncoder = charset.newEncoder();
+        private boolean prettyPrint = true;
+        private int indentAmount = 1;
+
+        public OutputSettings() {}
+
+        /**
+         * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML
+         * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>,
+         * which uses the complete set of HTML named entities.
+         * <p>
+         * The default escape mode is <code>base</code>.
+         * @return the document's current escape mode
+         */
+        public Entities.EscapeMode escapeMode() {
+            return escapeMode;
+        }
+
+        /**
+         * Set the document's escape mode
+         * @param escapeMode the new escape mode to use
+         * @return the document's output settings, for chaining
+         */
+        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
+            this.escapeMode = escapeMode;
+            return this;
+        }
+
+        /**
+         * Get the document's current output charset, which is used to control which characters are escaped when
+         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
+         * <p>
+         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
+         * input charset. Otherwise, it defaults to UTF-8.
+         * @return the document's current charset.
+         */
+        public Charset charset() {
+            return charset;
+        }
+
+        /**
+         * Update the document's output charset.
+         * @param charset the new charset to use.
+         * @return the document's output settings, for chaining
+         */
+        public OutputSettings charset(Charset charset) {
+            // todo: this should probably update the doc's meta charset
+            this.charset = charset;
+            charsetEncoder = charset.newEncoder();
+            return this;
+        }
+
+        /**
+         * Update the document's output charset.
+         * @param charset the new charset (by name) to use.
+         * @return the document's output settings, for chaining
+         */
+        public OutputSettings charset(String charset) {
+            charset(Charset.forName(charset));
+            return this;
+        }
+
+        CharsetEncoder encoder() {
+            return charsetEncoder;
+        }
+
+        /**
+         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
+         * the output, and the output will generally look like the input.
+         * @return if pretty printing is enabled.
+         */
+        public boolean prettyPrint() {
+            return prettyPrint;
+        }
+
+        /**
+         * Enable or disable pretty printing.
+         * @param pretty new pretty print setting
+         * @return this, for chaining
+         */
+        public OutputSettings prettyPrint(boolean pretty) {
+            prettyPrint = pretty;
+            return this;
+        }
+
+        /**
+         * Get the current tag indent amount, used when pretty printing.
+         * @return the current indent amount
+         */
+        public int indentAmount() {
+            return indentAmount;
+        }
+
+        /**
+         * Set the indent amount for pretty printing
+         * @param indentAmount number of spaces to use for indenting each level. Must be >= 0.
+         * @return this, for chaining
+         */
+        public OutputSettings indentAmount(int indentAmount) {
+            Validate.isTrue(indentAmount >= 0);
+            this.indentAmount = indentAmount;
+            return this;
+        }
+
+        @Override
+        public OutputSettings clone() {
+            OutputSettings clone;
+            try {
+                clone = (OutputSettings) super.clone();
+            } catch (CloneNotSupportedException e) {
+                throw new RuntimeException(e);
+            }
+            clone.charset(charset.name()); // new charset and charset encoder
+            clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
+            // indentAmount, prettyPrint are primitives so object.clone() will handle
+            return clone;
+        }
+    }
+
+    /**
+     * Get the document's current output settings.
+     * @return the document's current output settings.
+     */
+    public OutputSettings outputSettings() {
+        return outputSettings;
+    }
+
+    public enum QuirksMode {
+        noQuirks, quirks, limitedQuirks;
+    }
+
+    public QuirksMode quirksMode() {
+        return quirksMode;
+    }
+
+    public Document quirksMode(QuirksMode quirksMode) {
+        this.quirksMode = quirksMode;
+        return this;
+    }
+}
+