diff options
Diffstat (limited to 'server/src/org/jsoup/nodes/Document.java')
-rw-r--r-- | server/src/org/jsoup/nodes/Document.java | 350 |
1 files changed, 350 insertions, 0 deletions
package org.jsoup.nodes;

import org.jsoup.helper.Validate;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.List;

/**
 A HTML Document.

 @author Jonathan Hedley, jonathan@hedley.net */
public class Document extends Element {
    private OutputSettings outputSettings = new OutputSettings();
    private QuirksMode quirksMode = QuirksMode.noQuirks;

    /**
     Create a new, empty Document.
     @param baseUri base URI of document
     @see org.jsoup.Jsoup#parse
     @see #createShell
     */
    public Document(String baseUri) {
        super(Tag.valueOf("#root"), baseUri);
    }

    /**
     Create a valid, empty shell of a document, suitable for adding more elements to.
     @param baseUri baseUri of document; must not be null
     @return document with html, head, and body elements.
     */
    public static Document createShell(String baseUri) {
        Validate.notNull(baseUri);

        Document doc = new Document(baseUri);
        Element html = doc.appendElement("html");
        html.appendElement("head");
        html.appendElement("body");

        return doc;
    }

    /**
     Accessor to the document's {@code head} element.
     @return the first {@code head} element found in the document, or {@code null} if there is none
     */
    public Element head() {
        return findFirstElementByTagName("head", this);
    }

    /**
     Accessor to the document's {@code body} element.
     @return the first {@code body} element found in the document, or {@code null} if there is none
     */
    public Element body() {
        return findFirstElementByTagName("body", this);
    }

    /**
     Get the string contents of the document's {@code title} element.
     @return Trimmed title, or empty string if none set.
     */
    public String title() {
        Element titleEl = getElementsByTag("title").first();
        return titleEl != null ? titleEl.text().trim() : "";
    }

    /**
     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
     not present. If the document has no {@code head} yet (for example a freshly constructed, empty document), the
     {@code head} element (and an enclosing {@code html} element, if also missing) is created first.
     @param title string to set as title; must not be null
     */
    public void title(String title) {
        Validate.notNull(title);
        Element titleEl = getElementsByTag("title").first();
        if (titleEl == null) { // add to head, creating the head on demand rather than dereferencing null
            ensureHead().appendElement("title").text(title);
        } else {
            titleEl.text(title);
        }
    }

    /**
     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
     @param tagName element tag name (e.g. {@code a})
     @return new element
     */
    public Element createElement(String tagName) {
        return new Element(Tag.valueOf(tagName), this.baseUri());
    }

    /**
     Normalise the document. This happens after the parse phase so generally does not need to be called.
     Ensures {@code html}, {@code head} and {@code body} elements exist, moves any non-blank text that sits directly
     under the root, {@code html} or {@code head} into the body, and merges duplicate {@code head} / {@code body}
     elements into the first of each.
     @return this document after normalisation
     */
    public Document normalise() {
        Element htmlEl = ensureHtml();
        if (head() == null) {
            htmlEl.prependElement("head");
        }
        if (body() == null) {
            htmlEl.appendElement("body");
        }

        // pull text nodes out of root, html, and head els, and push into body. non-text nodes are already taken care
        // of. do in inverse order to maintain text order.
        normaliseTextNodes(head());
        normaliseTextNodes(htmlEl);
        normaliseTextNodes(this);

        normaliseStructure("head", htmlEl);
        normaliseStructure("body", htmlEl);

        return this;
    }

    // finds the first <html> element, appending a new one to this document if none exists
    private Element ensureHtml() {
        Element htmlEl = findFirstElementByTagName("html", this);
        return htmlEl != null ? htmlEl : appendElement("html");
    }

    // finds the first <head> element, prepending a new one to <html> (created if needed) if none exists
    private Element ensureHead() {
        Element headEl = head();
        return headEl != null ? headEl : ensureHtml().prependElement("head");
    }

    // finds the first <body> element, appending a new one to <html> (created if needed) if none exists
    private Element ensureBody() {
        Element bodyEl = body();
        return bodyEl != null ? bodyEl : ensureHtml().appendElement("body");
    }

    // moves the non-blank text nodes that are direct children of the element to the start of the body.
    // does not recurse.
    private void normaliseTextNodes(Element element) {
        // collect first, then move: the child list must not be modified while it is being iterated
        List<Node> toMove = new ArrayList<Node>();
        for (Node node : element.childNodes) {
            if (node instanceof TextNode) {
                TextNode tn = (TextNode) node;
                if (!tn.isBlank()) {
                    toMove.add(tn);
                }
            }
        }

        // moving text nodes never changes which element is the first <body>, so look it up once
        Element bodyEl = body();
        // walk backwards and prepend, so the moved nodes keep their original relative order
        for (int i = toMove.size() - 1; i >= 0; i--) {
            Node node = toMove.get(i);
            element.removeChild(node);
            bodyEl.prependChild(new TextNode(" ", "")); // single space separating the moved text from what follows
            bodyEl.prependChild(node);
        }
    }

    // merge multiple <head> or <body> contents into one, delete the remainder, and ensure they are owned by <html>
    private void normaliseStructure(String tag, Element htmlEl) {
        Elements elements = this.getElementsByTag(tag);
        Element master = elements.first(); // will always be available as created above if not existent
        if (elements.size() > 1) { // dupes, move contents to master
            List<Node> toMove = new ArrayList<Node>();
            for (int i = 1; i < elements.size(); i++) {
                Node dupe = elements.get(i);
                toMove.addAll(dupe.childNodes); // snapshot the children before detaching the duplicate
                dupe.remove();
            }

            for (Node node : toMove) {
                master.appendChild(node);
            }
        }
        // ensure parented by <html>
        if (!master.parent().equals(htmlEl)) {
            htmlEl.appendChild(master); // includes remove()
        }
    }

    // fast method to get first by tag name, used for html, head, body finders.
    // depth-first search starting at (and including) the given node; returns null when nothing matches.
    private Element findFirstElementByTagName(String tag, Node node) {
        if (node.nodeName().equals(tag)) {
            return (Element) node;
        }
        for (Node child : node.childNodes) {
            Element found = findFirstElementByTagName(tag, child);
            if (found != null) {
                return found;
            }
        }
        return null;
    }

    @Override
    public String outerHtml() {
        return super.html(); // no outer wrapper tag
    }

    /**
     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared. If the
     document has no {@code body} yet, one is created (along with an enclosing {@code html} element, if missing).
     @param text unencoded text
     @return this document
     */
    @Override
    public Element text(String text) {
        ensureBody().text(text); // overridden to not nuke doc structure
        return this;
    }

    @Override
    public String nodeName() {
        return "#document";
    }

    /**
     Create a copy of this document. The copy receives its own clone of the output settings, so changing the settings
     of one document does not affect the other.
     @return the cloned document
     */
    @Override
    public Document clone() {
        Document clone = (Document) super.clone();
        clone.outputSettings = this.outputSettings.clone();
        return clone;
    }

    /**
     * A Document's output settings control the form of the text() and html() methods.
     */
    public static class OutputSettings implements Cloneable {
        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
        private Charset charset = Charset.forName("UTF-8");
        private CharsetEncoder charsetEncoder = charset.newEncoder();
        private boolean prettyPrint = true;
        private int indentAmount = 1;

        public OutputSettings() {}

        /**
         * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML
         * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>,
         * which uses the complete set of HTML named entities.
         * <p>
         * The default escape mode is <code>base</code>.
         * @return the document's current escape mode
         */
        public Entities.EscapeMode escapeMode() {
            return escapeMode;
        }

        /**
         * Set the document's escape mode
         * @param escapeMode the new escape mode to use
         * @return the document's output settings, for chaining
         */
        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
            this.escapeMode = escapeMode;
            return this;
        }

        /**
         * Get the document's current output charset, which is used to control which characters are escaped when
         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
         * <p>
         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
         * input charset. Otherwise, it defaults to UTF-8.
         * @return the document's current charset.
         */
        public Charset charset() {
            return charset;
        }

        /**
         * Update the document's output charset. If an encoder cannot be obtained for the given charset, the
         * exception propagates and the current charset and encoder are left unchanged.
         * @param charset the new charset to use.
         * @return the document's output settings, for chaining
         */
        public OutputSettings charset(Charset charset) {
            // todo: this should probably update the doc's meta charset
            // obtain the encoder before touching any field, so a failure cannot leave charset and encoder out of sync
            CharsetEncoder encoder = charset.newEncoder();
            this.charset = charset;
            this.charsetEncoder = encoder;
            return this;
        }

        /**
         * Update the document's output charset.
         * @param charset the new charset (by name) to use.
         * @return the document's output settings, for chaining
         */
        public OutputSettings charset(String charset) {
            charset(Charset.forName(charset));
            return this;
        }

        /**
         * Get the encoder that belongs to the current output charset. It is replaced whenever the charset is set.
         * @return the encoder for the current charset
         */
        CharsetEncoder encoder() {
            return charsetEncoder;
        }

        /**
         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
         * the output, and the output will generally look like the input.
         * @return if pretty printing is enabled.
         */
        public boolean prettyPrint() {
            return prettyPrint;
        }

        /**
         * Enable or disable pretty printing.
         * @param pretty new pretty print setting
         * @return this, for chaining
         */
        public OutputSettings prettyPrint(boolean pretty) {
            prettyPrint = pretty;
            return this;
        }

        /**
         * Get the current tag indent amount, used when pretty printing.
         * @return the current indent amount
         */
        public int indentAmount() {
            return indentAmount;
        }

        /**
         * Set the indent amount for pretty printing
         * @param indentAmount number of spaces to use for indenting each level. Must be >= 0.
         * @return this, for chaining
         */
        public OutputSettings indentAmount(int indentAmount) {
            Validate.isTrue(indentAmount >= 0);
            this.indentAmount = indentAmount;
            return this;
        }

        /**
         * Create a copy of these settings. The copy gets its own charset encoder instance rather than sharing
         * this object's encoder.
         * @return the cloned settings
         */
        @Override
        public OutputSettings clone() {
            OutputSettings clone;
            try {
                clone = (OutputSettings) super.clone();
            } catch (CloneNotSupportedException e) {
                throw new RuntimeException(e);
            }
            clone.charset(charset.name()); // new charset and charset encoder
            clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
            // indentAmount, prettyPrint are primitives so object.clone() will handle
            return clone;
        }
    }

    /**
     * Get the document's current output settings.
     * @return the document's current output settings.
     */
    public OutputSettings outputSettings() {
        return outputSettings;
    }

    /**
     * The quirks mode values that can be recorded on a document. A new document starts as {@code noQuirks}.
     */
    public enum QuirksMode {
        noQuirks, quirks, limitedQuirks;
    }

    /**
     * Get the quirks mode recorded on this document.
     * @return the current quirks mode
     */
    public QuirksMode quirksMode() {
        return quirksMode;
    }

    /**
     * Set the quirks mode recorded on this document.
     * @param quirksMode the new quirks mode
     * @return this document, for chaining
     */
    public Document quirksMode(QuirksMode quirksMode) {
        this.quirksMode = quirksMode;
        return this;
    }
}