1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
|
package org.jsoup.parser;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.*;
import java.util.Iterator;
/**
 * Tree builder that turns a token stream into a node tree without applying HTML-specific insertion rules:
 * each start tag becomes a child of the current element, and each end tag closes the nearest open element
 * of the same name.
 *
 * @author Jonathan Hedley
 */
public class XmlTreeBuilder extends TreeBuilder {

    /**
     * Prepares the parse via the superclass and then pushes the document onto the stack.
     *
     * @param input   passed through to the superclass initialiser
     * @param baseUri passed through to the superclass initialiser
     * @param errors  passed through to the superclass initialiser
     */
    @Override
    protected void initialiseParse(String input, String baseUri, ParseErrorList errors) {
        super.initialiseParse(input, baseUri, errors);
        // place the document onto the stack. differs from HtmlTreeBuilder (not on stack)
        stack.add(doc);
    }

    /**
     * Dispatches a single token to the matching handler.
     * Handled types: start tag, end tag, doctype, comment, character, eof. Any other type is reported
     * through {@code Validate.fail}.
     *
     * @param token the token to process
     * @return always {@code true} when the method returns normally
     */
    @Override
    protected boolean process(Token token) {
        switch (token.type) {
            case StartTag:
                insert(token.asStartTag());
                break;
            case EndTag:
                popStackToClose(token.asEndTag());
                break;
            case Comment:
                insert(token.asComment());
                break;
            case Character:
                insert(token.asCharacter());
                break;
            case Doctype:
                insert(token.asDoctype());
                break;
            case EOF:
                // nothing to do; could put some normalisation here if desired
                break;
            default:
                Validate.fail("Unexpected token type: " + token.type);
        }
        return true;
    }

    /**
     * Appends the node as the last child of the current element.
     *
     * @param node the node to attach
     */
    private void appendToCurrent(Node node) {
        currentElement().appendChild(node);
    }

    /**
     * Creates an element for the start tag and appends it to the current element. A self-closing tag is
     * acknowledged to the tokeniser and not pushed onto the stack; any other tag is pushed so that it
     * becomes the current element.
     *
     * @param startTag the start tag token
     * @return the newly created element
     */
    Element insert(Token.StartTag startTag) {
        final Tag tag = Tag.valueOf(startTag.name());
        // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html.
        final Element element = new Element(tag, baseUri, startTag.attributes);
        appendToCurrent(element);

        if (!startTag.isSelfClosing()) {
            stack.add(element);
            return element;
        }

        tokeniser.acknowledgeSelfClosingFlag();
        if (!tag.isKnownTag()) {
            // unknown tag, remember this is self closing for output. see the todo above.
            tag.setSelfClosing();
        }
        return element;
    }

    /**
     * Appends a comment node built from the token's data to the current element.
     *
     * @param commentToken the comment token
     */
    void insert(Token.Comment commentToken) {
        appendToCurrent(new Comment(commentToken.getData(), baseUri));
    }

    /**
     * Appends a text node built from the token's data to the current element.
     *
     * @param characterToken the character token
     */
    void insert(Token.Character characterToken) {
        appendToCurrent(new TextNode(characterToken.getData(), baseUri));
    }

    /**
     * Appends a document type node built from the token's name, public identifier and system identifier
     * to the current element.
     *
     * @param d the doctype token
     */
    void insert(Token.Doctype d) {
        appendToCurrent(new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri));
    }

    /**
     * Closes the nearest open element whose node name equals the end tag's name. The stack is searched from
     * the top down; when a match is found, it and every element above it are removed from the stack. When
     * no element matches, the stack is left unchanged.
     *
     * @param endTag the end tag token whose name is matched against the elements on the stack
     */
    private void popStackToClose(Token.EndTag endTag) {
        final String closingName = endTag.name();

        // First pass: locate the closest matching element, starting from the top of the stack.
        Element target = null;
        for (Iterator<Element> scan = stack.descendingIterator(); scan.hasNext(); ) {
            Element candidate = scan.next();
            if (candidate.nodeName().equals(closingName)) {
                target = candidate;
                break;
            }
        }
        if (target == null) {
            return; // not found, skip
        }

        // Second pass: pop from the top until the matched element itself has been removed.
        for (Iterator<Element> pop = stack.descendingIterator(); pop.hasNext(); ) {
            Element top = pop.next();
            pop.remove();
            if (top == target) {
                break;
            }
        }
    }
}
|