1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
|
"""
This module contains a DTD parser that reports DTD parse events to a listener.
Used by xmlproc to parse DTDs, but can be used for other purposes as well.
$Id: dtdparser.py,v 1.7 2000/12/19 03:36:28 mikeolson Exp $
"""
from types import StringType
import string
string_find = string.find # optimization
from xmlutils import *
from xmldtd import *
# ==============================
# A DTD parser
# ==============================
class DTDParser(XMLCommonParser):
"A parser for XML DTDs, both internal and external."
# --- LOW-LEVEL SCANNING METHODS
# Redefined here with extra checking for parameter entity processing
def find_reg(self, regexp, required=1):
    """Move to the next match of regexp and return the text skipped over.

    self.data is searched from self.pos.  On a match, self.pos is set to
    the start of the match (the matched text itself is not consumed) and
    the data between the old and the new position is returned.

    Without a match:
    - if self.final is set and required is false, self.pos is moved to
      the end of self.data and the rest of the data is returned;
    - otherwise, if self.in_peref is set, pop_entity() is called, the
      flag is cleared, whitespace is skipped and the search is restarted,
      so the result only covers text scanned after the pop;
    - otherwise OutOfDataException is raised.
    """
    oldpos = self.pos
    mo = regexp.search(self.data, self.pos)
    if mo is None:
        if self.final and not required:
            self.pos = len(self.data)  # Just moved to the end
            return self.data[oldpos:]

        if self.in_peref:
            self.pop_entity()
            self.in_peref = 0
            self._skip_ws()
            return self.find_reg(regexp, required)

        raise OutOfDataException()

    self.pos = mo.start(0)
    return self.data[oldpos:self.pos]
def scan_to(self, target):
    """Return the text up to the next occurrence of target and skip past it.

    self.pos ends up just after target; target itself is not part of the
    returned text.

    If target is not found while self.in_peref is set, pop_entity() is
    called, the flag is cleared, whitespace is skipped and the scan is
    restarted.  If it is not found otherwise, OutOfDataException is raised.
    """
    # str.find replaces the deprecated string.find() module function; both
    # return -1 when target does not occur at or after self.pos.
    new_pos = self.data.find(target, self.pos)
    if new_pos == -1:
        if self.in_peref:
            self.pop_entity()
            self.in_peref = 0
            self._skip_ws()
            return self.scan_to(target)
        raise OutOfDataException()

    res = self.data[self.pos:new_pos]
    self.pos = new_pos + len(target)
    return res
def get_index(self, target):
    """Return the index of the next occurrence of target in self.data.

    The search starts at self.pos; self.pos itself is left unchanged when
    target is found.

    If target is not found while self.in_peref is set, pop_entity() is
    called, the flag is cleared, whitespace is skipped and the search is
    restarted.  If it is not found otherwise, OutOfDataException is raised.
    """
    # str.find replaces the deprecated string.find() module function.
    new_pos = self.data.find(target, self.pos)
    if new_pos == -1:
        if self.in_peref:
            self.pop_entity()
            self.in_peref = 0
            self._skip_ws()
            return self.get_index(target)
        raise OutOfDataException()
    return new_pos
def test_str(self, str):
    """Tell whether the data at the current position starts with str.

    The position is not moved when the comparison is made.

    When fewer than len(str) characters remain and the buffer is not
    final: if self.in_peref is set, pop_entity() is called, the flag is
    cleared, whitespace is skipped and the test is repeated; otherwise
    OutOfDataException is raised.
    """
    wanted = len(str)
    if not self.final and self.datasize - self.pos < wanted:
        if not self.in_peref:
            raise OutOfDataException()
        self.pop_entity()
        self.in_peref = 0
        self._skip_ws()
        return self.test_str(str)

    here = self.pos
    return self.data[here:here + wanted] == str
def now_at(self, test_str):
    """Consume test_str if the data at the current position starts with it.

    Returns 1 and moves self.pos past test_str on success; returns 0 and
    leaves self.pos alone otherwise.

    When fewer than len(test_str) characters remain and the buffer is not
    final: if self.in_peref is set, pop_entity() is called, the flag is
    cleared, whitespace is skipped and the test is repeated; otherwise
    OutOfDataException is raised.
    """
    size = len(test_str)
    if not self.final and self.datasize - self.pos < size:
        if not self.in_peref:
            raise OutOfDataException()
        self.pop_entity()
        self.in_peref = 0
        self._skip_ws()
        return self.now_at(test_str)

    stop = self.pos + size
    if self.data[self.pos:stop] != test_str:
        return 0
    self.pos = stop
    return 1
def _skip_ws(self,necessary=0):
    """Skip whitespace at the current position without expanding PE refs.

    If necessary is true and no whitespace was found, error 3002 is
    reported, unless the next character is '%'.
    """
    start=self.pos
    try:
        # Indexing past the end of self.data raises IndexError, which is
        # how running out of buffered data is detected here.
        while self.data[self.pos] in whitespace:
            self.pos=self.pos+1
        if necessary and self.pos==start and self.data[self.pos]!="%":
            self.report_error(3002)
    except IndexError:
        if self.in_peref:
            # The current entity is exhausted: pop it and keep skipping.
            # Note that the 'necessary' flag is not passed on.
            self.pop_entity()
            self.in_peref=0
            self._skip_ws()
            return
        if necessary and start==self.pos:
            if self.final:
                self.report_error(3002)
            else:
                # More data may still arrive, so let the caller retry.
                raise OutOfDataException()
def skip_ws(self, necessary=0):
    """Skip whitespace and, outside the internal subset, follow PE refs.

    Whitespace is skipped with _skip_ws(necessary).  When self.internal is
    set nothing else happens.  Otherwise, if a '%' follows, the name after
    it is read and looked up with self.dtd.resolve_pe():

    - unknown name: error 3038 is reported and the method returns;
    - internal entity: its value is pushed with push_entity(),
      self.in_peref is set and self.final is forced to 1;
    - any other entity: error 4003 is reported.

    Error 3005 is reported if the name is not followed by ';'.
    """
    self._skip_ws(necessary)
    if self.internal:
        return

    try:
        if not self.now_at("%"):
            return
    except OutOfDataException:
        # Too little data to look for a '%'; nothing to expand.
        return

    name = self._get_name()

    if not self.now_at(";"):
        self.report_error(3005, ";")

    try:
        ent = self.dtd.resolve_pe(name)
    except KeyError:
        self.report_error(3038, name)
        return

    if ent.is_internal():
        self.in_peref = 1
        self.push_entity(self.get_current_sysid(), ent.value)
        self.final = 1  # Reset by pop_ent, needed for buffer handling
    else:
        self.report_error(4003)

    # At this point we need to try again, since the entity we just
    # tried may have contained only whitespace (or nothing at all).
    # Using self._skip_ws() makes us fail when an empty PE is followed
    # by a non-empty one. (DocBook has examples of this.)
    self.skip_ws()
def test_reg(self, regexp):
    """Tell whether regexp matches at the current position.

    The position is not moved.

    When self.pos is within 5 characters of the end of the buffer and the
    buffer is not final: if self.in_peref is set, pop_entity() is called,
    the flag is cleared, whitespace is skipped and the test is repeated;
    otherwise OutOfDataException is raised.
    NOTE(review): the margin of 5 is not explained in this file.
    """
    if self.pos > self.datasize - 5 and not self.final:
        if self.in_peref:
            self.pop_entity()
            self.in_peref = 0
            self._skip_ws()
            return self.test_reg(regexp)
        raise OutOfDataException()

    return regexp.match(self.data, self.pos) is not None
def get_match(self, regexp):
    """Match regexp at the current position and consume the matched text.

    Returns the matched text and moves self.pos past it.  If regexp does
    not match, the error code found in reg2code for regexp.pattern is
    reported and "" is returned.

    When self.pos is within 5 characters of the end of the buffer and the
    buffer is not final: if self.in_peref is set, pop_entity() is called,
    the flag is cleared, whitespace is skipped and the match is retried;
    otherwise OutOfDataException is raised.

    When the match runs up to the very end of the buffer: if
    self.in_peref is set, pop_entity() is called and the matched text is
    returned; otherwise OutOfDataException is raised.
    NOTE(review): that last raise happens even when self.final is set.
    """
    if self.pos > self.datasize - 5 and not self.final:
        if self.in_peref:
            self.pop_entity()
            self.in_peref = 0
            self._skip_ws()
            return self.get_match(regexp)
        raise OutOfDataException()

    ent = regexp.match(self.data, self.pos)
    if ent is None:
        self.report_error(reg2code[regexp.pattern])
        return ""

    end = ent.end(0)  # Speeds us up slightly
    if end == self.datasize:
        if self.in_peref:
            # No whitespace skipping here, unlike the other scanning
            # methods: the match is handed back exactly as found.
            self.pop_entity()
            self.in_peref = 0
            return ent.group(0)
        raise OutOfDataException()

    self.pos = end
    return ent.group(0)
# --- DTD Parser proper
def __init__(self):
    """Create a DTD parser that reports to a fresh DTDConsumerPE."""
    # NOTE(review): the class statement names XMLCommonParser as its base
    # while this calls EntityParser.__init__ -- presumably XMLCommonParser
    # derives from EntityParser; confirm in xmlutils.
    EntityParser.__init__(self)
    self.internal=0
    self.seen_xmldecl=0
    self.dtd=DTDConsumerPE() # Keeps track of PE info
    self.dtd_consumer=self.dtd # Where all events go
    self.in_peref=0 # Set while reading a PE pushed by skip_ws
    self.ignores_entered=0 # Open IGNORE sections
    self.includes_entered=0 # Open INCLUDE sections in the current entity
    self.own_ent_stack=[] # Keeps includes_entered
def reset(self):
    """Put the parser state back to its initial values."""
    EntityParser.reset(self)
    # self.dtd may not exist yet -- presumably reset() can run before
    # __init__ has assigned it (TODO confirm in EntityParser).
    if hasattr(self,"dtd"):
        self.dtd.reset()
    self.internal=0
    self.seen_xmldecl=0
    self.in_peref=0
    self.ignores_entered=0
    self.includes_entered=0
    self.own_ent_stack=[] # Keeps includes_entered
    self.dtd_start_called = 0 # Set to 1 if parsing external subset from
    # xmlproc.py (which has called dtd_start...)
def parseStart(self):
    """Send dtd_start() to the DTD consumer unless it was already sent."""
    # dtd_start_called is set by whoever already issued the event.
    if not self.dtd_start_called:
        self.dtd_consumer.dtd_start()
def parseEnd(self):
    """Send dtd_end() to the DTD consumer."""
    self.dtd_consumer.dtd_end()
def set_dtd_consumer(self,dtd):
    """Tell the parser where to send DTD information.

    dtd becomes the object that receives every declaration event.
    """
    self.dtd_consumer=dtd
def set_dtd_object(self,dtd):
    """Tell the parser where to mirror PE information (in addition to
    what goes to the DTD consumer) and where to get PE information."""
    self.dtd=dtd
def set_internal(self,yesno):
    """Tell the parser whether the DTD is internal (true) or external."""
    self.internal=yesno
def deref(self):
    """Remove circular references.

    Sets ent, dtd_consumer, dtd, app and err to None.
    """
    self.ent = self.dtd_consumer = self.dtd = self.app = self.err = None
def do_parse(self):
    """Parse as many complete declarations as the buffer holds.

    Declarations are dispatched on their opening token.  After each one
    the position is remembered, so that when the data runs out in the
    middle of a declaration and the buffer is not final, self.pos is
    rewound to the start of that declaration.

    Raises OutOfDataException when the data runs out (or an IndexError
    occurs) while self.final is set.
    """
    prepos = self.pos
    try:
        # Resume an IGNORE section left open by an earlier buffer.
        if self.ignores_entered > 0:
            self.parse_ignored_data()

        self._skip_ws()
        while self.pos < self.datasize:
            if self.now_at("<!ELEMENT"):
                self.parse_elem_type()
            elif self.now_at("<!ENTITY"):
                self.parse_entity()
            elif self.now_at("<!ATTLIST"):
                self.parse_attlist()
            elif self.now_at("<!NOTATION"):
                self.parse_notation()
            elif self.test_reg(reg_pe_ref):
                self.parse_pe_ref()
            elif self.now_at("<?"):
                self.parse_pi(self.dtd_consumer)
            elif self.now_at("<!--"):
                self.parse_comment(self.dtd_consumer)
            elif self.now_at("<!["):
                self.parse_conditional()
            elif self.now_at("]]>") and self.includes_entered > 0:
                # End of an INCLUDE section opened in this entity.
                self.includes_entered = self.includes_entered - 1
            else:
                self.report_error(3013)
                self.scan_to(">")

            prepos = self.pos
            self._skip_ws()

        if self.final and self.includes_entered > 0:
            self.report_error(3043)

    except OutOfDataException:
        if self.final:
            # A bare raise keeps the original traceback.
            raise
        self.pos = prepos
    except IndexError:
        if self.final:
            raise OutOfDataException()
        self.pos = prepos
def parse_entity(self):
    """Parse an entity declaration and report it to the DTD consumer.

    Handles general and parameter entities ('%' after the keyword), each
    either internal (a quoted replacement text) or external (an external
    id, optionally followed by an NDATA notation name).  Parameter
    entities are also mirrored to self.dtd when that is a different
    object from the DTD consumer.

    Errors reported: 3002 (no whitespace before NDATA), 3035 (NDATA on a
    parameter entity), 3005 (missing '>').
    """
    EntityParser.skip_ws(self, 1)  # No PE refs allowed here
    if self.now_at("%"):
        pedecl = 1
        EntityParser.skip_ws(self, 1)  # No PE refs allowed here
    else:
        pedecl = 0

    ent_name = self._get_name()
    self.skip_ws(1)

    (pub_id, sys_id) = self.parse_external_id(0)

    # No system id means the declaration carries a literal value instead.
    if sys_id is None:
        internal = 1
        ent_val = self.parse_ent_repltext()
    else:
        internal = 0

    # Only peek at the keyword here.  Consuming it (as now_at does) would
    # make the NDATA branch below unreachable after the error is reported.
    if self.test_str("NDATA"):
        self.report_error(3002)
    else:
        self.skip_ws()

    if not internal and self.now_at("NDATA"):
        # Parsing the optional NDataDecl
        if pedecl:
            self.report_error(3035)
        self.skip_ws()

        ndata = self._get_name()
        self.skip_ws()
    else:
        ndata = ""

    if not self.now_at(">"):
        self.report_error(3005, ">")

    if pedecl:
        # These are echoed to self.dtd so we remember this stuff
        if internal:
            self.dtd_consumer.new_parameter_entity(ent_name, ent_val)
            if self.dtd != self.dtd_consumer:
                self.dtd.new_parameter_entity(ent_name, ent_val)
        else:
            self.dtd_consumer.new_external_pe(ent_name, pub_id, sys_id)
            if self.dtd != self.dtd_consumer:
                self.dtd.new_external_pe(ent_name, pub_id, sys_id)
    else:
        if internal:
            self.dtd_consumer.new_general_entity(ent_name, ent_val)
        else:
            self.dtd_consumer.new_external_entity(ent_name, pub_id, sys_id,
                                                  ndata)
def parse_ent_repltext(self):
    """Parse a quoted entity replacement text and resolve all character
    and parameter entity references in it.

    The text must start with a single or double quote and runs up to the
    next quote of the same kind.  Returns the text as processed by
    parse_ent_litval().

    If no quote is found, error 3004 is reported, the input is skipped
    past the next '>' and None is returned.
    """
    if self.now_at('"'):
        delim = '"'
    elif self.now_at("'"):
        delim = "'"
    else:
        self.report_error(3004, ("'", "\""))
        self.scan_to(">")
        return

    return self.parse_ent_litval(self.scan_to(delim))
def parse_ent_litval(self, litval):
    """Return litval with character and PE references replaced.

    The text is scanned for the positions matched by reg_litval_stop:

    - '&#N;' / '&#xN;': replaced by the character with that code when it
      is 9, 10, 13 or in 32..255; otherwise the reference is dropped and
      error 1005 (code above 255) or 3018 is reported;
    - '%name;': replaced by the recursively processed value of the
      parameter entity.  Error 3038 is reported for an unknown name and
      3037 for an entity that is not internal;
    - anything else: error 4001 is reported and scanning stops.

    A reference without a closing ';' reports error 3005 and stops the
    scan.  Text that was not scanned is returned unchanged.

    NOTE(review): a non-numeric decimal reference such as '&#abc;' makes
    int() raise ValueError, which is not handled here.
    """
    pos = 0
    parts = []  # Joined once at the end instead of repeated concatenation

    while 1:
        res = reg_litval_stop.search(litval, pos)
        if res is None:
            break

        parts.append(litval[pos:res.start(0)])
        pos = res.start(0)

        if litval[pos:pos + 2] == "&#":
            endpos = litval.find(";", pos)
            if endpos == -1:
                self.report_error(3005, ";")
                break

            if litval[pos + 2] == "x":
                digs = unhex(litval[pos + 3:endpos])
            else:
                digs = int(litval[pos + 2:endpos])

            if digs in (9, 10, 13) or 32 <= digs <= 255:
                parts.append(chr(digs))
            elif digs > 255:
                self.report_error(1005, digs)
            else:
                self.report_error(3018, digs)

            pos = endpos + 1
        elif litval[pos] == "%":
            endpos = litval.find(";", pos)
            if endpos == -1:
                self.report_error(3005, ";")
                break

            name = litval[pos + 1:endpos]
            # Only the lookup is guarded, so a KeyError raised elsewhere
            # is not misreported as an undeclared entity.
            try:
                ent = self.dtd.resolve_pe(name)
            except KeyError:
                self.report_error(3038, name)
            else:
                if ent.is_internal():
                    parts.append(self.parse_ent_litval(ent.value))
                else:
                    self.report_error(3037)  # FIXME: Easily solved now...?

            pos = endpos + 1
        else:
            self.report_error(4001)
            break

    return "".join(parts) + litval[pos:]
def parse_notation(self):
    """Parse a notation declaration and report it to the DTD consumer.

    Reads the notation name and its external id, reports error 3005 if
    the closing '>' is missing, and then calls new_notation() with the
    name, public id and system id.
    """
    self.skip_ws(1)
    notation_name = self._get_name()
    self.skip_ws(1)

    public_id, system_id = self.parse_external_id(1, 0)
    self.skip_ws()

    closed = self.now_at(">")
    if not closed:
        self.report_error(3005, ">")

    self.dtd_consumer.new_notation(notation_name, public_id, system_id)
def parse_pe_ref(self):
    """Parse a reference to a parameter entity and parse what it holds.

    An unknown entity name reports error 3038.  An internal entity has
    its value pushed as input, parsed with do_parse() and popped again.
    Any other entity is located through self.pubres and opened with
    open_entity(), with the parser temporarily switched to external mode.
    """
    # The slice drops the first and last character of the match,
    # presumably the '%' and ';' delimiters.
    name = self.get_match(reg_pe_ref)[1:-1]

    try:
        ent = self.dtd.resolve_pe(name)
    except KeyError:
        self.report_error(3038, name)
        return

    if ent.is_internal():
        self.push_entity(self.get_current_sysid(), ent.value)
        self.do_parse()
        self.pop_entity()
    else:
        sysid = self.pubres.resolve_pe_pubid(ent.get_pubid(),
                                             ent.get_sysid())
        # Remember the current mode so it is restored even if opening
        # the entity fails.
        was_internal = self.internal
        self.set_internal(0)
        try:
            self.open_entity(sysid)  # Does parsing and popping
        finally:
            self.set_internal(was_internal)
def parse_attlist(self):
    """Parse an attribute list declaration.

    Reads the element name and then, until '>' is seen, one attribute
    definition after another.  Each definition is reported with
    new_attribute(elem, attr, a_type, a_decl, a_def), where

    - a_type is the text matched by reg_attr_type, a
      ("NOTATION", [names]) tuple, or a list of enumeration tokens;
    - a_decl is "#DEFAULT", "#IMPLIED", "#REQUIRED", "#FIXED" or None;
    - a_def is the default value, or None when there is none.

    Errors reported: 3044 (token listed twice in an enumeration), 3039
    (unrecognised attribute type; the rest of the declaration is
    skipped) and 3909 (unrecognised default declaration).
    NOTE(review): 3909 lies outside the 30xx range of every other code
    used in this file -- confirm it exists in the error table.
    """
    self.skip_ws(1)
    elem = self._get_name()
    self.skip_ws(1)

    while not self.test_str(">"):
        attr = self._get_name()
        self.skip_ws(1)

        if self.test_reg(reg_attr_type):
            a_type = self.get_match(reg_attr_type)
        elif self.now_at("NOTATION"):
            self.skip_ws(1)
            a_type = ("NOTATION", self.__parse_list(reg_name, "|"))
        elif self.now_at("("):
            self.pos = self.pos - 1  # Does not expect '(' to be skipped
            a_type = self.__parse_list(reg_nmtoken, "|")

            # Each token of an enumeration may be listed only once.
            seen = {}
            for token in a_type:
                if token in seen:
                    self.report_error(3044, (token,))
                else:
                    seen[token] = 1
        else:
            self.report_error(3039)
            self.scan_to(">")
            return

        self.skip_ws(1)

        if self.test_str("\"") or self.test_str("'"):
            a_decl = "#DEFAULT"
            a_def = self.parse_ent_repltext()
        elif self.now_at("#IMPLIED"):
            a_decl = "#IMPLIED"
            a_def = None
        elif self.now_at("#REQUIRED"):
            a_decl = "#REQUIRED"
            a_def = None
        elif self.now_at("#FIXED"):
            self.skip_ws(1)
            a_decl = "#FIXED"
            a_def = self.parse_ent_repltext()
        else:
            self.report_error(3909)
            a_decl = None
            a_def = None

        self.skip_ws()
        self.dtd_consumer.new_attribute(elem, attr, a_type, a_decl, a_def)

    self.pos = self.pos + 1  # Skipping the '>'
def parse_elem_type(self):
    """Parse an element type declaration and report it to the consumer.

    The content specification is "EMPTY", "ANY" or a parenthesised
    content model.  Anything else reports error 3004 and is treated as
    "ANY".  Error 3005 is reported if the closing '>' is missing.
    """
    self.skip_ws(1)
    declared_name = self._get_name()
    self.skip_ws(1)

    # Content spec: try the two keywords first, then a content model.
    for keyword in ("EMPTY", "ANY"):
        if self.now_at(keyword):
            content = keyword
            break
    else:
        if self.now_at("("):
            content = self._parse_content_model()
        else:
            self.report_error(3004, ("EMPTY, ANY", "("))
            content = "ANY"  # Keeps downstream code working after the error

    self.skip_ws()
    if not self.now_at(">"):
        self.report_error(3005, ">")

    self.dtd_consumer.new_element_type(declared_name, content)
def _parse_content_model(self, level=0):
    """Parse the content model of an element type declaration.

    level tells whether this is the top level (0) or a nested group (1).
    The '(' has just been passed over; parsing continues past the ')'
    and any modifier that follows it.

    Returns a tuple (separator, contents, modifier).  contents holds a
    (name, modifier) tuple for each element name and a complete content
    model tuple for each nested group.  At the top level a model starting
    with #PCDATA is handed to parse_mixed_content_model().

    Errors reported: 3004 (neither '|' nor ',' after the first item) and
    3040 (separator differs from the one already used in this group).
    """
    self.skip_ws()

    # Creates a content list with separator first
    cont_list = []
    sep = ""

    if self.now_at("#PCDATA") and level == 0:
        return self.parse_mixed_content_model()

    while 1:
        self.skip_ws()
        if self.now_at("("):
            # A nested group already carries its own modifier.
            cont_list.append(self._parse_content_model(1))
        else:
            name = self._get_name()

            if self.test_str("?") or self.test_str("*") or self.test_str("+"):
                mod = self.data[self.pos]
                self.pos = self.pos + 1
            else:
                mod = ""

            # Names are always stored with their modifier.  Branching on
            # how the item was read, rather than on its type, keeps this
            # true for names that are not plain str objects.
            cont_list.append((name, mod))

        self.skip_ws()
        if self.now_at(")"):
            break

        if sep == "":
            # The first separator seen fixes the separator of the group.
            if self.test_str("|") or self.test_str(","):
                sep = self.data[self.pos]
            else:
                self.report_error(3004, ("'|'", ","))

            self.pos = self.pos + 1
        else:
            if not self.now_at(sep):
                self.report_error(3040)
                self.scan_to(")")

    if self.test_str("+") or self.test_str("?") or self.test_str("*"):
        mod = self.data[self.pos]
        self.pos = self.pos + 1
    else:
        mod = ""

    return (sep, cont_list, mod)
def parse_mixed_content_model(self):
    """Parse a mixed content model, i.e. one that starts with #PCDATA.

    Reads '|'-separated names up to the closing ')' and an optional '*'
    after it.  Returns (separator, contents, modifier), where contents
    starts with ("#PCDATA", "") followed by one (name, "") tuple per
    element name.

    Errors reported: 3005 with '|' when neither '|' nor ')' is found (the
    input is then skipped past the next '>'), and 3005 with '*' when
    names were listed but the model is not followed by '*'.
    """
    cont_list = [("#PCDATA", "")]
    sep = ""
    mod = ""

    while 1:
        self.skip_ws()

        if self.now_at("|"):
            sep = "|"
        elif self.now_at(")"):
            break
        else:
            self.report_error(3005, "|")
            self.scan_to(">")

        self.skip_ws()
        cont_list.append((self.get_match(reg_name), ""))

    if self.now_at("*"):
        mod = "*"
    elif sep == "|":
        self.report_error(3005, "*")

    return (sep, cont_list, mod)
def parse_conditional(self):
    """Parse the start of a conditional section.

    In the internal subset, error 3041 is reported and everything up to
    the next ']]>' is skipped.

    Otherwise an IGNORE section is skipped with parse_ignored_data(),
    and an INCLUDE section increases self.includes_entered by one.  Any
    other keyword reports error 3004 and the section is then treated as
    an INCLUDE section.  A missing '[' after the keyword reports error
    3005.
    """
    if self.internal:
        self.report_error(3041)
        self.scan_to("]]>")
        return

    self.skip_ws()

    if self.now_at("IGNORE"):
        self.ignores_entered = 1
        self.skip_ws()
        if not self.now_at("["):
            self.report_error(3005, "[")
        self.parse_ignored_data()
        return

    if self.now_at("INCLUDE"):
        self.skip_ws()
        if not self.now_at("["):
            self.report_error(3005, "[")
    else:
        self.report_error(3004, ("'IGNORE'", "INCLUDE"))
        # scan_to() also consumes the '[', so it must not be looked for
        # again, and the section must only be counted once, below.
        self.scan_to("[")

    # Doing an extra skip_ws and waiting until we get here
    # before increasing the include count, to avoid increasing
    # the count inside a PE, where it would be forgotten after pop.
    self.skip_ws()
    self.includes_entered = self.includes_entered + 1
def parse_ignored_data(self):
    """Skip the contents of the IGNORE section(s) currently open.

    Starting from a depth of self.ignores_entered, each ']]>' found
    lowers the depth by one and any other reg_cond_sect match raises it
    by one (three characters are then skipped, presumably the '<!['
    opener).  When the depth reaches zero self.ignores_entered is set to
    0.

    If the data runs out first, the depth reached is stored in
    self.ignores_entered, the buffer is emptied and the
    OutOfDataException is re-raised.  Error 3043 is reported beforehand
    when the buffer is final.
    """
    try:
        counter = self.ignores_entered
        while counter:
            self.find_reg(reg_cond_sect)
            if self.now_at("]]>"):
                counter = counter - 1
            else:
                counter = counter + 1
                self.pos = self.pos + 3

    except OutOfDataException:
        if self.final:
            self.report_error(3043)

        # Remember how deep we are and drop the buffer; none of what is
        # left in it needs to be looked at again.
        self.ignores_entered = counter
        self.data = ""
        self.pos = 0
        self.datasize = 0
        raise  # A bare raise keeps the original traceback

    self.ignores_entered = 0
def __parse_list(self, elem_regexp, separator):
    """Parse a '(' S? elem_regexp S? separator ... ')' list.  (Internal.)

    Returns the list of texts matched by elem_regexp.  Error 3005 is
    reported if the opening '(' is missing.  If an item is followed by
    neither ')' nor separator, error 3004 is reported and the items read
    so far are returned.
    """
    items = []
    self.skip_ws()
    if not self.now_at("("):
        self.report_error(3005, "(")

    while 1:
        self.skip_ws()
        items.append(self.get_match(elem_regexp))
        self.skip_ws()
        if self.now_at(")"):
            break
        elif not self.now_at(separator):
            self.report_error(3004, ("')'", separator))
            break

    return items
def is_external(self):
    """Return true when the parser is not in internal-subset mode."""
    return not self.internal
# --- Internal methods
def _push_ent_stack(self,name="None"):
    """Save parser state before entering a new entity.

    In addition to what EntityParser saves, the current INCLUDE section
    count is stored and the count starts at zero for the new entity.
    NOTE(review): the default for name is the string "None", not the
    None object -- confirm that this is intended.
    """
    EntityParser._push_ent_stack(self,name)
    self.own_ent_stack.append(self.includes_entered)
    self.includes_entered=0
def _pop_ent_stack(self):
    """Restore parser state after leaving an entity.

    In addition to what EntityParser restores, the INCLUDE section count
    saved by the matching _push_ent_stack() call is put back.
    """
    EntityParser._pop_ent_stack(self)
    # pop() hands back the most recently saved count and removes it.
    self.includes_entered = self.own_ent_stack.pop()
# --- Minimal DTD consumer
class DTDConsumerPE(DTDConsumer):
    """Minimal DTD consumer that only keeps track of parameter entities.

    Declarations are stored in self.param_ents, keyed by entity name.
    When a name is declared more than once the first declaration wins.
    """

    def __init__(self):
        # NOTE(review): the meaning of the None argument is defined by
        # DTDConsumer.__init__, which is not visible here.
        DTDConsumer.__init__(self, None)
        self.param_ents = {}
        self.used_notations = {}

    def new_parameter_entity(self, name, val):
        """Record an internal parameter entity with replacement text val."""
        if name not in self.param_ents:  # Keep first decl
            self.param_ents[name] = InternalEntity(name, val)

    def new_external_pe(self, name, pubid, sysid):
        """Record an external parameter entity."""
        if name not in self.param_ents:  # Keep first decl
            self.param_ents[name] = ExternalEntity(name, pubid, sysid, "")

    def resolve_pe(self, name):
        """Return the entity declared as name; raises KeyError if unknown."""
        return self.param_ents[name]

    def reset(self):
        """Forget every parameter entity recorded so far."""
        # NOTE(review): used_notations is set in __init__ but is not
        # cleared here -- confirm whether that is intended.
        self.param_ents = {}
|