diff options
Diffstat (limited to 'server/src/org/jsoup/safety/Cleaner.java')
-rw-r--r-- | server/src/org/jsoup/safety/Cleaner.java | 106 |
1 files changed, 69 insertions, 37 deletions
diff --git a/server/src/org/jsoup/safety/Cleaner.java b/server/src/org/jsoup/safety/Cleaner.java index eda67df86b..046efbbaa8 100644 --- a/server/src/org/jsoup/safety/Cleaner.java +++ b/server/src/org/jsoup/safety/Cleaner.java @@ -1,29 +1,41 @@ package org.jsoup.safety; +import java.util.List; + import org.jsoup.helper.Validate; -import org.jsoup.nodes.*; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; import org.jsoup.parser.Tag; -import java.util.List; - /** - The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes - that you are expecting; no junk, and no cross-site scripting attacks! - <p/> - The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain - HTML that is allowed by the whitelist. - <p/> - It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the - canned white-lists only allow body contained tags. - <p/> - Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}. + * The whitelist based HTML cleaner. Use to ensure that end-user provided HTML + * contains only the elements and attributes that you are expecting; no junk, + * and no cross-site scripting attacks! + * <p/> + * The HTML cleaner parses the input as HTML and then runs it through a + * white-list, so the output HTML can only contain HTML that is allowed by the + * whitelist. + * <p/> + * It is assumed that the input HTML is a body fragment; the clean methods only + * pull from the source's body, and the canned white-lists only allow body + * contained tags. + * <p/> + * Rather than interacting directly with a Cleaner object, generally see the + * {@code clean} methods in {@link org.jsoup.Jsoup}. */ public class Cleaner { private Whitelist whitelist; /** - Create a new cleaner, that sanitizes documents using the supplied whitelist. - @param whitelist white-list to clean with + * Create a new cleaner, that sanitizes documents using the supplied + * whitelist. + * + * @param whitelist + * white-list to clean with */ public Cleaner(Whitelist whitelist) { Validate.notNull(whitelist); @@ -31,10 +43,14 @@ public class Cleaner { } /** - Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist. - The original document is not modified. Only elements from the dirt document's <code>body</code> are used. - @param dirtyDocument Untrusted base document to clean. - @return cleaned document. + * Creates a new, clean document, from the original dirty document, + * containing only elements allowed by the whitelist. The original document + * is not modified. Only elements from the dirt document's <code>body</code> + * are used. + * + * @param dirtyDocument + * Untrusted base document to clean. + * @return cleaned document. */ public Document clean(Document dirtyDocument) { Validate.notNull(dirtyDocument); @@ -46,14 +62,20 @@ public class Cleaner { } /** - Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes - in the input HTML are allowed by the whitelist. - <p/> - This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully - using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document - to ensure enforced attributes are set correctly, and that the output is tidied. - @param dirtyDocument document to test - @return true if no tags or attributes need to be removed; false if they do + * Determines if the input document is valid, against the whitelist. It is + * considered valid if all the tags and attributes in the input HTML are + * allowed by the whitelist. + * <p/> + * This method can be used as a validator for user input forms. An invalid + * document will still be cleaned successfully using the + * {@link #clean(Document)} document. If using as a validator, it is + * recommended to still clean the document to ensure enforced attributes are + * set correctly, and that the output is tidied. + * + * @param dirtyDocument + * document to test + * @return true if no tags or attributes need to be removed; false if they + * do */ public boolean isValid(Document dirtyDocument) { Validate.notNull(dirtyDocument); @@ -64,10 +86,14 @@ public class Cleaner { } /** - Iterates the input and copies trusted nodes (tags, attributes, text) into the destination. - @param source source of HTML - @param dest destination element to copy into - @return number of discarded elements (that were considered unsafe) + * Iterates the input and copies trusted nodes (tags, attributes, text) into + * the destination. + * + * @param source + * source of HTML + * @param dest + * destination element to copy into + * @return number of discarded elements (that were considered unsafe) */ private int copySafeNodes(Element source, Element dest) { List<Node> sourceChildren = source.childNodes(); @@ -77,20 +103,24 @@ public class Cleaner { if (sourceChild instanceof Element) { Element sourceEl = (Element) sourceChild; - if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs + if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone + // and copy safe + // attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; dest.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; numDiscarded += copySafeNodes(sourceEl, destChild); // recurs - } else { // not a safe tag, but it may have children (els or text) that are, so recurse + } else { // not a safe tag, but it may have children (els or + // text) that are, so recurse numDiscarded++; numDiscarded += copySafeNodes(sourceEl, dest); } } else if (sourceChild instanceof TextNode) { TextNode sourceText = (TextNode) sourceChild; - TextNode destText = new TextNode(sourceText.getWholeText(), sourceChild.baseUri()); + TextNode destText = new TextNode(sourceText.getWholeText(), + sourceChild.baseUri()); dest.appendChild(destText); } // else, we don't care about comments, xml proc instructions, etc } @@ -100,15 +130,17 @@ public class Cleaner { private ElementMeta createSafeElement(Element sourceEl) { String sourceTag = sourceEl.tagName(); Attributes destAttrs = new Attributes(); - Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs); + Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), + destAttrs); int numDiscarded = 0; Attributes sourceAttrs = sourceEl.attributes(); for (Attribute sourceAttr : sourceAttrs) { - if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) + if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) { destAttrs.put(sourceAttr); - else + } else { numDiscarded++; + } } Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag); destAttrs.addAll(enforcedAttrs); |