summary | refs | log | tree | commit | diff | stats
path: root/server/src/org/jsoup/safety/Cleaner.java
diff options
context:
space:
mode:
Diffstat (limited to 'server/src/org/jsoup/safety/Cleaner.java')
-rw-r--r-- server/src/org/jsoup/safety/Cleaner.java | 129
1 files changed, 129 insertions, 0 deletions
diff --git a/server/src/org/jsoup/safety/Cleaner.java b/server/src/org/jsoup/safety/Cleaner.java
new file mode 100644
index 0000000000..eda67df86b
--- /dev/null
+++ b/server/src/org/jsoup/safety/Cleaner.java
@@ -0,0 +1,129 @@
+package org.jsoup.safety;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.*;
+import org.jsoup.parser.Tag;
+
+import java.util.List;
+
+/**
+ The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
+ that you are expecting; no junk, and no cross-site scripting attacks!
+ <p/>
+ The HTML cleaner parses the input as HTML and then runs it through a whitelist, so the output HTML can only contain
+ HTML that is allowed by the whitelist.
+ <p/>
+ It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
+ canned whitelists only allow body-contained tags.
+ <p/>
+ Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
+ */
+public class Cleaner {
+ private Whitelist whitelist; // consulted for every tag/attribute decision; assigned only in the constructor
+
+ /**
+ Create a new cleaner that sanitizes documents using the supplied whitelist.
+ @param whitelist whitelist to clean with; passed through {@code Validate.notNull} before being stored
+ */
+ public Cleaner(Whitelist whitelist) {
+ Validate.notNull(whitelist);
+ this.whitelist = whitelist;
+ }
+
+ /**
+ Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist.
+ The original document is not modified. Only elements from the dirty document's <code>body</code> are used.
+ @param dirtyDocument Untrusted base document to clean.
+ @return cleaned document.
+ */
+ public Document clean(Document dirtyDocument) {
+ Validate.notNull(dirtyDocument);
+
+ Document clean = Document.createShell(dirtyDocument.baseUri()); // destination built from the dirty document's base URI
+ copySafeNodes(dirtyDocument.body(), clean.body()); // the returned discard count is not needed here, so it is ignored
+
+ return clean;
+ }
+
+ /**
+ Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes
+ in the input HTML are allowed by the whitelist.
+ <p/>
+ This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully
+ using the {@link #clean(Document)} method. If using as a validator, it is recommended to still clean the document
+ to ensure enforced attributes are set correctly, and that the output is tidied.
+ @param dirtyDocument document to test
+ @return true if no tags or attributes need to be removed; false if they do
+ */
+ public boolean isValid(Document dirtyDocument) {
+ Validate.notNull(dirtyDocument);
+
+ Document clean = Document.createShell(dirtyDocument.baseUri()); // scratch destination; only the discard count is used
+ int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
+ return numDiscarded == 0; // NOTE: nodes that are neither elements nor text are dropped by copySafeNodes without being counted
+ }
+
+ /**
+ Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
+ @param source source of HTML
+ @param dest destination element to copy into
+ @return number of discarded elements and attributes (those that were considered unsafe)
+ */
+ private int copySafeNodes(Element source, Element dest) {
+ List<Node> sourceChildren = source.childNodes();
+ int numDiscarded = 0;
+
+ for (Node sourceChild : sourceChildren) {
+ if (sourceChild instanceof Element) {
+ Element sourceEl = (Element) sourceChild;
+
+ if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
+ ElementMeta meta = createSafeElement(sourceEl);
+ Element destChild = meta.el;
+ dest.appendChild(destChild);
+
+ numDiscarded += meta.numAttribsDiscarded;
+ numDiscarded += copySafeNodes(sourceEl, destChild); // recurse into the children, copying them under the new element
+ } else { // not a safe tag, but it may have children (els or text) that are, so recurse
+ numDiscarded++;
+ numDiscarded += copySafeNodes(sourceEl, dest); // safe descendants go straight into dest, dropping the unsafe wrapper
+ }
+ } else if (sourceChild instanceof TextNode) {
+ TextNode sourceText = (TextNode) sourceChild;
+ TextNode destText = new TextNode(sourceText.getWholeText(), sourceChild.baseUri()); // text is copied into a new node, without a whitelist check
+ dest.appendChild(destText);
+ } // else, we don't care about comments, xml proc instructions, etc: they are skipped and not counted as discarded
+ }
+ return numDiscarded;
+ }
+
+ private ElementMeta createSafeElement(Element sourceEl) { // builds a new element with the same tag and base URI, keeping only whitelisted attributes; no children are added here
+ String sourceTag = sourceEl.tagName();
+ Attributes destAttrs = new Attributes();
+ Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);
+ int numDiscarded = 0; // number of source attributes rejected by the whitelist
+
+ Attributes sourceAttrs = sourceEl.attributes();
+ for (Attribute sourceAttr : sourceAttrs) {
+ if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
+ destAttrs.put(sourceAttr);
+ else
+ numDiscarded++;
+ }
+ Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag); // attributes the whitelist enforces for this tag
+ destAttrs.addAll(enforcedAttrs); // added after the copied attributes; does not affect the discard count
+
+ return new ElementMeta(dest, numDiscarded);
+ }
+
+ private static class ElementMeta { // result holder for createSafeElement: the new element plus its attribute discard count
+ Element el; // the newly created element
+ int numAttribsDiscarded; // how many source attributes were not copied onto el
+
+ ElementMeta(Element el, int numAttribsDiscarded) {
+ this.el = el;
+ this.numAttribsDiscarded = numAttribsDiscarded;
+ }
+ }
+
+}