server/src/org/jsoup/parser/XmlTreeBuilder.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121

package org.jsoup.parser;

import java.util.Iterator;

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

/**
 * @author Jonathan Hedley
 */
public class XmlTreeBuilder extends TreeBuilder {
    @Override
    protected void initialiseParse(String input, String baseUri,
            ParseErrorList errors) {
        super.initialiseParse(input, baseUri, errors);
        stack.add(doc); // place the document onto the stack. differs from
                        // HtmlTreeBuilder (not on stack)
    }

    @Override
    protected boolean process(Token token) {
        // start tag, end tag, doctype, comment, character, eof
        switch (token.type) {
        case StartTag:
            insert(token.asStartTag());
            break;
        case EndTag:
            popStackToClose(token.asEndTag());
            break;
        case Comment:
            insert(token.asComment());
            break;
        case Character:
            insert(token.asCharacter());
            break;
        case Doctype:
            insert(token.asDoctype());
            break;
        case EOF: // could put some normalisation here if desired
            break;
        default:
            Validate.fail("Unexpected token type: " + token.type);
        }
        return true;
    }

    private void insertNode(Node node) {
        currentElement().appendChild(node);
    }

    Element insert(Token.StartTag startTag) {
        Tag tag = Tag.valueOf(startTag.name());
        // todo: wonder if for xml parsing, should treat all tags as unknown?
        // because it's not html.
        Element el = new Element(tag, baseUri, startTag.attributes);
        insertNode(el);
        if (startTag.isSelfClosing()) {
            tokeniser.acknowledgeSelfClosingFlag();
            if (!tag.isKnownTag()) {
                tag.setSelfClosing();
            }
        } else {
            stack.add(el);
        }
        return el;
    }

    void insert(Token.Comment commentToken) {
        Comment comment = new Comment(commentToken.getData(), baseUri);
        insertNode(comment);
    }

    void insert(Token.Character characterToken) {
        Node node = new TextNode(characterToken.getData(), baseUri);
        insertNode(node);
    }

    void insert(Token.Doctype d) {
        DocumentType doctypeNode = new DocumentType(d.getName(),
                d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri);
        insertNode(doctypeNode);
    }

    /**
     * If the stack contains an element with this tag's name, pop up the stack
     * to remove the first occurrence. If not found, skips.
     * 
     * @param endTag
     */
    private void popStackToClose(Token.EndTag endTag) {
        String elName = endTag.name();
        Element firstFound = null;

        Iterator<Element> it = stack.descendingIterator();
        while (it.hasNext()) {
            Element next = it.next();
            if (next.nodeName().equals(elName)) {
                firstFound = next;
                break;
            }
        }
        if (firstFound == null) {
            return; // not found, skip
        }

        it = stack.descendingIterator();
        while (it.hasNext()) {
            Element next = it.next();
            if (next == firstFound) {
                it.remove();
                break;
            } else {
                it.remove();
            }
        }
    }
}