server/src/org/jsoup/safety/Cleaner.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161

package org.jsoup.safety;

import java.util.List;

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;

/**
 * The whitelist based HTML cleaner. Use to ensure that end-user provided HTML
 * contains only the elements and attributes that you are expecting; no junk,
 * and no cross-site scripting attacks!
 * <p/>
 * The HTML cleaner parses the input as HTML and then runs it through a
 * white-list, so the output HTML can only contain HTML that is allowed by the
 * whitelist.
 * <p/>
 * It is assumed that the input HTML is a body fragment; the clean methods only
 * pull from the source's body, and the canned white-lists only allow body
 * contained tags.
 * <p/>
 * Rather than interacting directly with a Cleaner object, generally see the
 * {@code clean} methods in {@link org.jsoup.Jsoup}.
 */
public class Cleaner {
    private Whitelist whitelist;

    /**
     * Create a new cleaner, that sanitizes documents using the supplied
     * whitelist.
     * 
     * @param whitelist
     *            white-list to clean with
     */
    public Cleaner(Whitelist whitelist) {
        Validate.notNull(whitelist);
        this.whitelist = whitelist;
    }

    /**
     * Creates a new, clean document, from the original dirty document,
     * containing only elements allowed by the whitelist. The original document
     * is not modified. Only elements from the dirt document's <code>body</code>
     * are used.
     * 
     * @param dirtyDocument
     *            Untrusted base document to clean.
     * @return cleaned document.
     */
    public Document clean(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);

        Document clean = Document.createShell(dirtyDocument.baseUri());
        copySafeNodes(dirtyDocument.body(), clean.body());

        return clean;
    }

    /**
     * Determines if the input document is valid, against the whitelist. It is
     * considered valid if all the tags and attributes in the input HTML are
     * allowed by the whitelist.
     * <p/>
     * This method can be used as a validator for user input forms. An invalid
     * document will still be cleaned successfully using the
     * {@link #clean(Document)} document. If using as a validator, it is
     * recommended to still clean the document to ensure enforced attributes are
     * set correctly, and that the output is tidied.
     * 
     * @param dirtyDocument
     *            document to test
     * @return true if no tags or attributes need to be removed; false if they
     *         do
     */
    public boolean isValid(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);

        Document clean = Document.createShell(dirtyDocument.baseUri());
        int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
        return numDiscarded == 0;
    }

    /**
     * Iterates the input and copies trusted nodes (tags, attributes, text) into
     * the destination.
     * 
     * @param source
     *            source of HTML
     * @param dest
     *            destination element to copy into
     * @return number of discarded elements (that were considered unsafe)
     */
    private int copySafeNodes(Element source, Element dest) {
        List<Node> sourceChildren = source.childNodes();
        int numDiscarded = 0;

        for (Node sourceChild : sourceChildren) {
            if (sourceChild instanceof Element) {
                Element sourceEl = (Element) sourceChild;

                if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone
                                                               // and copy safe
                                                               // attrs
                    ElementMeta meta = createSafeElement(sourceEl);
                    Element destChild = meta.el;
                    dest.appendChild(destChild);

                    numDiscarded += meta.numAttribsDiscarded;
                    numDiscarded += copySafeNodes(sourceEl, destChild); // recurs
                } else { // not a safe tag, but it may have children (els or
                         // text) that are, so recurse
                    numDiscarded++;
                    numDiscarded += copySafeNodes(sourceEl, dest);
                }
            } else if (sourceChild instanceof TextNode) {
                TextNode sourceText = (TextNode) sourceChild;
                TextNode destText = new TextNode(sourceText.getWholeText(),
                        sourceChild.baseUri());
                dest.appendChild(destText);
            } // else, we don't care about comments, xml proc instructions, etc
        }
        return numDiscarded;
    }

    private ElementMeta createSafeElement(Element sourceEl) {
        String sourceTag = sourceEl.tagName();
        Attributes destAttrs = new Attributes();
        Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(),
                destAttrs);
        int numDiscarded = 0;

        Attributes sourceAttrs = sourceEl.attributes();
        for (Attribute sourceAttr : sourceAttrs) {
            if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) {
                destAttrs.put(sourceAttr);
            } else {
                numDiscarded++;
            }
        }
        Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
        destAttrs.addAll(enforcedAttrs);

        return new ElementMeta(dest, numDiscarded);
    }

    private static class ElementMeta {
        Element el;
        int numAttribsDiscarded;

        ElementMeta(Element el, int numAttribsDiscarded) {
            this.el = el;
            this.numAttribsDiscarded = numAttribsDiscarded;
        }
    }

}