summaryrefslogtreecommitdiffstats
path: root/server/src/org/jsoup/safety/Cleaner.java
diff options
context:
space:
mode:
Diffstat (limited to 'server/src/org/jsoup/safety/Cleaner.java')
-rw-r--r--server/src/org/jsoup/safety/Cleaner.java106
1 files changed, 69 insertions, 37 deletions
diff --git a/server/src/org/jsoup/safety/Cleaner.java b/server/src/org/jsoup/safety/Cleaner.java
index eda67df86b..046efbbaa8 100644
--- a/server/src/org/jsoup/safety/Cleaner.java
+++ b/server/src/org/jsoup/safety/Cleaner.java
@@ -1,29 +1,41 @@
package org.jsoup.safety;
+import java.util.List;
+
import org.jsoup.helper.Validate;
-import org.jsoup.nodes.*;
+import org.jsoup.nodes.Attribute;
+import org.jsoup.nodes.Attributes;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;
-import java.util.List;
-
/**
- The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
- that you are expecting; no junk, and no cross-site scripting attacks!
- <p/>
- The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain
- HTML that is allowed by the whitelist.
- <p/>
- It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
- canned white-lists only allow body contained tags.
- <p/>
- Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
+ * The whitelist based HTML cleaner. Use to ensure that end-user provided HTML
+ * contains only the elements and attributes that you are expecting; no junk,
+ * and no cross-site scripting attacks!
+ * <p/>
+ * The HTML cleaner parses the input as HTML and then runs it through a
+ * white-list, so the output HTML can only contain HTML that is allowed by the
+ * whitelist.
+ * <p/>
+ * It is assumed that the input HTML is a body fragment; the clean methods only
+ * pull from the source's body, and the canned white-lists only allow body
+ * contained tags.
+ * <p/>
+ * Rather than interacting directly with a Cleaner object, generally see the
+ * {@code clean} methods in {@link org.jsoup.Jsoup}.
*/
public class Cleaner {
private Whitelist whitelist;
/**
- Create a new cleaner, that sanitizes documents using the supplied whitelist.
- @param whitelist white-list to clean with
+ * Create a new cleaner, that sanitizes documents using the supplied
+ * whitelist.
+ *
+ * @param whitelist
+ * white-list to clean with
*/
public Cleaner(Whitelist whitelist) {
Validate.notNull(whitelist);
@@ -31,10 +43,14 @@ public class Cleaner {
}
/**
- Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist.
- The original document is not modified. Only elements from the dirt document's <code>body</code> are used.
- @param dirtyDocument Untrusted base document to clean.
- @return cleaned document.
+ * Creates a new, clean document, from the original dirty document,
+ * containing only elements allowed by the whitelist. The original document
+ * is not modified. Only elements from the dirty document's <code>body</code>
+ * are used.
+ *
+ * @param dirtyDocument
+ * Untrusted base document to clean.
+ * @return cleaned document.
*/
public Document clean(Document dirtyDocument) {
Validate.notNull(dirtyDocument);
@@ -46,14 +62,20 @@ public class Cleaner {
}
/**
- Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes
- in the input HTML are allowed by the whitelist.
- <p/>
- This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully
- using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document
- to ensure enforced attributes are set correctly, and that the output is tidied.
- @param dirtyDocument document to test
- @return true if no tags or attributes need to be removed; false if they do
+ * Determines if the input document is valid, against the whitelist. It is
+ * considered valid if all the tags and attributes in the input HTML are
+ * allowed by the whitelist.
+ * <p/>
+ * This method can be used as a validator for user input forms. An invalid
+ * document will still be cleaned successfully using the
+ * {@link #clean(Document)} method. If using as a validator, it is
+ * recommended to still clean the document to ensure enforced attributes are
+ * set correctly, and that the output is tidied.
+ *
+ * @param dirtyDocument
+ * document to test
+ * @return true if no tags or attributes need to be removed; false if they
+ * do
*/
public boolean isValid(Document dirtyDocument) {
Validate.notNull(dirtyDocument);
@@ -64,10 +86,14 @@ public class Cleaner {
}
/**
- Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
- @param source source of HTML
- @param dest destination element to copy into
- @return number of discarded elements (that were considered unsafe)
+ * Iterates the input and copies trusted nodes (tags, attributes, text) into
+ * the destination.
+ *
+ * @param source
+ * source of HTML
+ * @param dest
+ * destination element to copy into
+ * @return number of discarded elements (that were considered unsafe)
*/
private int copySafeNodes(Element source, Element dest) {
List<Node> sourceChildren = source.childNodes();
@@ -77,20 +103,24 @@ public class Cleaner {
if (sourceChild instanceof Element) {
Element sourceEl = (Element) sourceChild;
- if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
+ if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone
+ // and copy safe
+ // attrs
ElementMeta meta = createSafeElement(sourceEl);
Element destChild = meta.el;
dest.appendChild(destChild);
numDiscarded += meta.numAttribsDiscarded;
numDiscarded += copySafeNodes(sourceEl, destChild); // recurs
- } else { // not a safe tag, but it may have children (els or text) that are, so recurse
+ } else { // not a safe tag, but it may have children (els or
+ // text) that are, so recurse
numDiscarded++;
numDiscarded += copySafeNodes(sourceEl, dest);
}
} else if (sourceChild instanceof TextNode) {
TextNode sourceText = (TextNode) sourceChild;
- TextNode destText = new TextNode(sourceText.getWholeText(), sourceChild.baseUri());
+ TextNode destText = new TextNode(sourceText.getWholeText(),
+ sourceChild.baseUri());
dest.appendChild(destText);
} // else, we don't care about comments, xml proc instructions, etc
}
@@ -100,15 +130,17 @@ public class Cleaner {
private ElementMeta createSafeElement(Element sourceEl) {
String sourceTag = sourceEl.tagName();
Attributes destAttrs = new Attributes();
- Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);
+ Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(),
+ destAttrs);
int numDiscarded = 0;
Attributes sourceAttrs = sourceEl.attributes();
for (Attribute sourceAttr : sourceAttrs) {
- if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
+ if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) {
destAttrs.put(sourceAttr);
- else
+ } else {
numDiscarded++;
+ }
}
Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
destAttrs.addAll(enforcedAttrs);