1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
|
#
# qp_xml: Quick Parsing for XML
#
# Written by Greg Stein. Public Domain.
# No Copyright, no Rights Reserved, and no Warranties.
#
# This module is maintained by Greg and is available as part of the XML-SIG
# distribution. This module and its changelog can be fetched at:
# http://www.lyra.org/cgi-bin/viewcvs.cgi/xml/xml/utils/qp_xml.py
#
# Additional information can be found on Greg's Python page at:
# http://www.lyra.org/greg/python/
#
# This module was added to the XML-SIG distribution on February 14, 2000.
# As part of that distribution, it falls under the XML distribution license.
#
import string
try:
import pyexpat
except ImportError:
from xml.parsers import pyexpat
class error(Exception):
    """Raised by Parser when a name uses an undeclared namespace prefix.

    This is a real exception class rather than a string because string
    exceptions cannot be raised on Python 2.6 and later.  The lowercase
    name is kept so existing ``except qp_xml.error:`` clauses still work.
    """


#
# The parsing class. Instantiate and pass a string/file to .parse()
#
class Parser:
    """Build a tree of ``_element`` objects from an XML document via pyexpat.

    Attributes:
        root: the outermost element of the most recent parse, or None.
        cur_elem: the element currently open while parsing, or None.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget any previously parsed tree."""
        self.root = None
        self.cur_elem = None

    def find_prefix(self, prefix):
        """Return the namespace URI bound to `prefix` in the current scope.

        The search starts at the element being parsed and walks outwards
        through its ancestors.  An unbound empty prefix yields '' (meaning
        "no namespace"); any other unbound prefix yields None.
        """
        elem = self.cur_elem
        while elem is not None:
            if prefix in elem.ns_scope:
                return elem.ns_scope[prefix]
            elem = elem.parent
        if prefix == '':
            return ''  # empty URL for "no namespace"
        return None

    def process_prefix(self, name, use_default):
        """Split a raw XML name into a ``(namespace, local_name)`` pair.

        `use_default` selects whether an unprefixed name takes the default
        namespace (as done for element names) or no namespace (as done for
        attribute names).

        Raises:
            error: the name carries a prefix with no declaration in scope.
        """
        idx = name.find(':')
        if idx == -1:
            if use_default:
                return self.find_prefix(''), name
            return '', name  # no namespace
        if name[:3].lower() == 'xml':
            # name is reserved by XML. don't break out a NS.
            return '', name
        ns = self.find_prefix(name[:idx])
        if ns is None:
            raise error('namespace prefix not found')
        return ns, name[idx + 1:]

    def start(self, name, attrs):
        """Expat start-tag handler: create an element and link it in."""
        elem = _element(name=name, lang=None, parent=None,
                        children=[], ns_scope={}, attrs={},
                        first_cdata='', following_cdata='')
        if self.cur_elem is not None:
            elem.parent = self.cur_elem
            elem.parent.children.append(elem)
            self.cur_elem = elem
        else:
            self.cur_elem = self.root = elem

        # Scan for namespace declarations (and xml:lang while we're at it).
        # Ordinary attributes are deferred until every declaration on this
        # element has been recorded, since their prefixes may depend on them.
        work_attrs = []
        for attr_name, value in attrs.items():
            if attr_name == 'xmlns':
                elem.ns_scope[''] = value
            elif attr_name[:6] == 'xmlns:':
                elem.ns_scope[attr_name[6:]] = value
            elif attr_name == 'xml:lang':
                elem.lang = value
            else:
                work_attrs.append((attr_name, value))

        # inherit xml:lang from parent
        if elem.lang is None and elem.parent is not None:
            elem.lang = elem.parent.lang

        # process prefix of the element name
        elem.ns, elem.name = self.process_prefix(elem.name, 1)

        # process attributes' namespace prefixes
        for attr_name, value in work_attrs:
            elem.attrs[self.process_prefix(attr_name, 0)] = value

    def end(self, name):
        """Expat end-tag handler: close the current element.

        The finished element loses its parse-time-only ``ns_scope`` and
        ``parent`` attributes, and its parent becomes current again.
        """
        parent = self.cur_elem.parent
        del self.cur_elem.ns_scope
        del self.cur_elem.parent
        self.cur_elem = parent

    def cdata(self, data):
        """Expat character-data handler.

        Text seen before the first child is appended to the current
        element's ``first_cdata``; text seen after a child is appended to
        that child's ``following_cdata``.
        """
        elem = self.cur_elem
        if elem.children:
            last = elem.children[-1]
            last.following_cdata = last.following_cdata + data
        else:
            elem.first_cdata = elem.first_cdata + data

    def parse(self, input):
        """Parse `input` and return the root element of the resulting tree.

        `input` is either the whole document as a string (byte or unicode)
        or an object with a ``read(size)`` method, which is consumed in
        ``_BLOCKSIZE`` pieces.  Exceptions raised while parsing propagate
        to the caller after the partial tree has been cleaned up.
        """
        self.reset()
        p = pyexpat.ParserCreate()
        p.StartElementHandler = self.start
        p.EndElementHandler = self.end
        p.CharacterDataHandler = self.cdata
        try:
            # (bytes, type(u'')) is (str, unicode) on Python 2 and
            # (bytes, str) on Python 3, so every string flavour is handed
            # to expat directly instead of being mistaken for a file.
            if isinstance(input, (bytes, type(u''))):
                p.Parse(input, 1)
            else:
                while 1:
                    s = input.read(_BLOCKSIZE)
                    if not s:
                        p.Parse('', 1)
                        break
                    p.Parse(s, 0)
        finally:
            # Elements still open when an error occurs keep their parent
            # links; strip them so the returned tree has a uniform shape.
            if self.root is not None:
                _clean_tree(self.root)
        return self.root
#
# handy function for dumping a tree that is returned by Parser
#
def dump(f, root):
    """Write the tree under `root` to `f` as an XML document.

    `f` only needs a ``write()`` method.  The output is an XML
    declaration, then the serialized tree with the namespace declarations
    placed on the outermost element, then a trailing newline.
    """
    f.write('<?xml version="1.0"?>\n')
    # Gather the namespaces used anywhere in the tree before serializing.
    ns_numbers = _collect_ns(root)
    _dump_recurse(f, root, ns_numbers, dump_ns=1)
    f.write('\n')
#
# This function returns the element's CDATA. Note: this is not recursive --
# it only returns the CDATA immediately within the element, excluding the
# CDATA in child elements.
#
def textof(elem):
    """Return the character data held directly by `elem`.

    This simply delegates to ``elem.textof()``; text inside child
    elements is not included.
    """
    text = elem.textof()
    return text
#########################################################################
#
# private stuff for qp_xml
#
# Size passed to each read() call when Parser.parse() is given a file-like
# object instead of a string.
_BLOCKSIZE = 16 * 1024
class _element:
    """A node of the parsed tree.

    Every keyword argument given to the constructor becomes an instance
    attribute of the same name.
    """

    def __init__(self, **kw):
        vars(self).update(kw)

    def textof(self):
        '''Return the CDATA of this element.

        Only the text sitting directly inside this element is returned:
        its leading text plus the text that follows each child.  Text
        nested inside the children themselves is left out.
        '''
        text = self.first_cdata
        for node in self.children:
            text = text + node.following_cdata
        return text

    def find(self, name, ns=''):
        '''Return the first direct child called `name` in namespace `ns`.

        Returns None when no child matches.
        '''
        for node in self.children:
            if node.name == name:
                if node.ns == ns:
                    return node
        return None
def _clean_tree(elem):
    """Remove the ``parent`` attribute from `elem` and all its descendants.

    An explicit worklist is used instead of ``map()``: on Python 3 ``map``
    is lazy, so using it for its side effects would never visit the
    children.  Iterating also avoids deep recursion on deeply nested trees.
    """
    pending = [elem]
    while pending:
        node = pending.pop()
        # Elements already closed by Parser.end() have no ``parent`` left;
        # assigning first guarantees the ``del`` cannot fail.
        node.parent = None
        del node.parent
        pending.extend(node.children)
def _collect_recurse(elem, dict):
    """Record every namespace used at or below `elem` as a key of `dict`.

    The element's own namespace is recorded first, then the namespaces of
    its attributes, then those of each child subtree in document order.
    Each recorded key is mapped to None.  (The parameter name shadows the
    builtin; it is kept so the signature stays unchanged.)
    """
    dict[elem.ns] = None
    # attrs is keyed by (namespace, local-name) pairs; only the namespace
    # half matters here.
    for attr_ns, _local in elem.attrs:
        dict[attr_ns] = None
    for kid in elem.children:
        _collect_recurse(kid, dict)
def _collect_ns(elem):
    """Collect all namespaces into a NAMESPACE -> PREFIX NUMBER mapping.

    Returns a dictionary whose keys are the namespace URIs used by `elem`
    and its descendants and whose values are distinct integers starting
    at 0.  The empty "no namespace" entry is never included.
    """
    # Seed the '' key so the ``del`` below works even when the tree never
    # uses the empty namespace.
    d = {'': None}
    _collect_recurse(elem, d)
    del d['']  # make sure we don't pick up no-namespace entries
    # Snapshot the keys with list(): dict.keys() cannot be indexed on
    # Python 3, and the snapshot keeps the iteration independent of the
    # value updates.
    for index, ns in enumerate(list(d)):
        d[ns] = index
    return d
def _dump_recurse(f, elem, namespaces, lang=None, dump_ns=0):
    """Serialize `elem` and its subtree to `f` as XML.

    Args:
        f: object with a ``write()`` method.
        elem: the element to write.
        namespaces: mapping of namespace URI to the integer N used to
            build the ``nsN`` prefix for that namespace.
        lang: the xml:lang value in effect in the enclosing element; an
            ``xml:lang`` attribute is written only when `elem` differs.
        dump_ns: when true, an ``xmlns:nsN`` declaration for every entry
            of `namespaces` is written on this element.

    Attribute values, namespace URIs and character data are escaped so
    that ``&``, ``<``, ``>`` (and ``"`` inside attributes) in the data
    still produce well-formed output.
    """
    # Imported here so the function does not rely on a module-level import.
    from xml.sax.saxutils import escape

    attr_entities = {'"': '&quot;'}

    # The qualified tag is needed for both the opening and closing tag.
    if elem.ns:
        tag = 'ns%d:%s' % (namespaces[elem.ns], elem.name)
    else:
        tag = elem.name
    f.write('<' + tag)

    for (ns, name), value in elem.attrs.items():
        # '%s' formatting first keeps non-string values working as before.
        text = escape('%s' % (value,), attr_entities)
        if ns:
            f.write(' ns%d:%s="%s"' % (namespaces[ns], name, text))
        else:
            f.write(' %s="%s"' % (name, text))

    if dump_ns:
        for uri, number in namespaces.items():
            f.write(' xmlns:ns%d="%s"'
                    % (number, escape('%s' % (uri,), attr_entities)))

    if elem.lang != lang:
        f.write(' xml:lang="%s"'
                % escape('%s' % (elem.lang,), attr_entities))

    if elem.children or elem.first_cdata:
        f.write('>' + escape(elem.first_cdata))
        for child in elem.children:
            _dump_recurse(f, child, namespaces, elem.lang)
            f.write(escape(child.following_cdata))
        f.write('</%s>' % tag)
    else:
        f.write('/>')
|