diff options
Diffstat (limited to 'server/src/org/jsoup/safety/Cleaner.java')
-rw-r--r-- | server/src/org/jsoup/safety/Cleaner.java | 129 |
1 files changed, 129 insertions, 0 deletions
diff --git a/server/src/org/jsoup/safety/Cleaner.java b/server/src/org/jsoup/safety/Cleaner.java new file mode 100644 index 0000000000..eda67df86b --- /dev/null +++ b/server/src/org/jsoup/safety/Cleaner.java @@ -0,0 +1,129 @@ +package org.jsoup.safety; + +import org.jsoup.helper.Validate; +import org.jsoup.nodes.*; +import org.jsoup.parser.Tag; + +import java.util.List; + +/** + The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes + that you are expecting; no junk, and no cross-site scripting attacks! + <p/> + The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain + HTML that is allowed by the whitelist. + <p/> + It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the + canned white-lists only allow body contained tags. + <p/> + Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}. + */ +public class Cleaner { + private Whitelist whitelist; + + /** + Create a new cleaner, that sanitizes documents using the supplied whitelist. + @param whitelist white-list to clean with + */ + public Cleaner(Whitelist whitelist) { + Validate.notNull(whitelist); + this.whitelist = whitelist; + } + + /** + Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist. + The original document is not modified. Only elements from the dirt document's <code>body</code> are used. + @param dirtyDocument Untrusted base document to clean. + @return cleaned document. + */ + public Document clean(Document dirtyDocument) { + Validate.notNull(dirtyDocument); + + Document clean = Document.createShell(dirtyDocument.baseUri()); + copySafeNodes(dirtyDocument.body(), clean.body()); + + return clean; + } + + /** + Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes + in the input HTML are allowed by the whitelist. + <p/> + This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully + using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document + to ensure enforced attributes are set correctly, and that the output is tidied. + @param dirtyDocument document to test + @return true if no tags or attributes need to be removed; false if they do + */ + public boolean isValid(Document dirtyDocument) { + Validate.notNull(dirtyDocument); + + Document clean = Document.createShell(dirtyDocument.baseUri()); + int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body()); + return numDiscarded == 0; + } + + /** + Iterates the input and copies trusted nodes (tags, attributes, text) into the destination. + @param source source of HTML + @param dest destination element to copy into + @return number of discarded elements (that were considered unsafe) + */ + private int copySafeNodes(Element source, Element dest) { + List<Node> sourceChildren = source.childNodes(); + int numDiscarded = 0; + + for (Node sourceChild : sourceChildren) { + if (sourceChild instanceof Element) { + Element sourceEl = (Element) sourceChild; + + if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs + ElementMeta meta = createSafeElement(sourceEl); + Element destChild = meta.el; + dest.appendChild(destChild); + + numDiscarded += meta.numAttribsDiscarded; + numDiscarded += copySafeNodes(sourceEl, destChild); // recurs + } else { // not a safe tag, but it may have children (els or text) that are, so recurse + numDiscarded++; + numDiscarded += copySafeNodes(sourceEl, dest); + } + } else if (sourceChild instanceof TextNode) { + TextNode sourceText = (TextNode) sourceChild; + TextNode destText = new TextNode(sourceText.getWholeText(), sourceChild.baseUri()); + dest.appendChild(destText); + } // else, we don't care about comments, xml proc instructions, etc + } + return numDiscarded; + } + + private ElementMeta createSafeElement(Element sourceEl) { + String sourceTag = sourceEl.tagName(); + Attributes destAttrs = new Attributes(); + Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs); + int numDiscarded = 0; + + Attributes sourceAttrs = sourceEl.attributes(); + for (Attribute sourceAttr : sourceAttrs) { + if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) + destAttrs.put(sourceAttr); + else + numDiscarded++; + } + Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag); + destAttrs.addAll(enforcedAttrs); + + return new ElementMeta(dest, numDiscarded); + } + + private static class ElementMeta { + Element el; + int numAttribsDiscarded; + + ElementMeta(Element el, int numAttribsDiscarded) { + this.el = el; + this.numAttribsDiscarded = numAttribsDiscarded; + } + } + +} |