1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
|
# A parser for the iRODS XML-like protocol.
# The interface aims to be compatible with xml.etree.ElementTree,
# at least for the features used by python-irodsclient.
class Element:
"""
Represents <name>body</name>.
(Where `body' is either a string or a list of sub-elements.)
"""
@property
def tag(self):
return self.name
def __init__(self, name, body):
"""Initialize with the tag's name and the body (i.e. content)."""
if body == []:
# Empty element.
self.text = None
elif type(body) is not list:
# String element: decode body.
body = decode_entities(body)
self.text = body
self.name = name
self.body = body
def find(self, name):
"""Get first matching child element by name."""
for x in self.findall(name):
return x
def findall(self, name):
"""Get matching child elements by name."""
return list(self.findall_(name))
def findall_(self, name):
"""Get matching child elements by name (generator variant)."""
return (el for el in self.body if el.name == name)
# For debugging convenience:
def __str__(self):
if type(self.body) is list:
return "<{}>{}</{}>".format(
self.name, "".join(map(str, self.body)), self.name
)
else:
return "<{}>{}</{}>".format(
self.name, encode_entities(self.body), self.name
)
def __repr__(self):
return "{}({})".format(self.name, repr(self.body))
class Token:
"""A utility class for parsing XML."""
def __init__(self, s):
"""Create a `Token' object from `s', the text comprising the parsed token."""
self.text = s
def __repr__(self):
return str(type(self).__name__) + "(" + self.text.decode("utf-8") + ")"
def __str__(self):
return repr(self)
class TokenTagOpen(Token):
"""An opening tag (<foo>)"""
class TokenTagClose(Token):
"""An closing tag (</foo>)"""
class TokenCData(Token):
"""Textual element body"""
class QuasiXmlParseError(Exception):
"""Indicates parse failure of XML protocol data."""
def tokenize(s):
"""Parse an XML-ish string into a list of tokens."""
tokens = []
# Consume input until empty.
while True:
nextclose = s.find(b"</")
nextopen = s.find(b"<")
if nextopen < nextclose or nextopen == -1:
# Either we have no tags left, or we are in a non-cdata element body: strip whitespace.
s = s.lstrip()
if len(s) == 0:
return tokens
# Closing tag?
elif s.startswith(b"</"):
try:
name, s = s[2:].split(b">", 1)
except Exception:
raise QuasiXmlParseError("protocol error: unterminated close tag")
tokens.append(TokenTagClose(name))
s = s.lstrip() # consume space after closing tag
# Opening tag?
elif s.startswith(b"<"):
try:
name, s = s[1:].split(b">", 1)
except Exception:
raise QuasiXmlParseError("protocol error: unterminated open tag")
tokens.append(TokenTagOpen(name))
else:
# capture cdata till next tag.
try:
cdata, s = s.split(b"<", 1)
except Exception:
raise QuasiXmlParseError("protocol error: unterminated cdata")
s = b"<" + s
tokens.append(TokenCData(cdata))
def fromtokens(tokens):
"""Parse XML-ish tokens into an Element."""
def parse_elem(tokens):
"""Parse some tokens into one Element, and return unconsumed tokens."""
topen, tokens = tokens[0], tokens[1:]
if type(topen) is not TokenTagOpen:
raise QuasiXmlParseError(
"protocol error: data does not start with open tag"
)
children = []
cdata = None
while len(tokens) > 0:
t, tokens = tokens[0], tokens[1:]
if type(t) is TokenTagOpen:
# Slurp a sub-element.
el, tokens = parse_elem([t] + tokens)
children.append(el)
# Continue with non-consumed tokens.
elif type(t) == TokenTagClose:
if t.text != topen.text:
raise QuasiXmlParseError(
"protocol error: close tag <{}> does not match opening tag <{}>".format(
t.text, topen.text
)
)
elif cdata is not None and len(children):
raise QuasiXmlParseError(
"protocol error: mixed cdata and child elements"
)
return (
Element(
topen.text.decode("utf-8"),
cdata.decode("utf-8") if cdata is not None else children,
),
tokens,
)
else:
cdata = t.text
elem, rest = parse_elem(tokens)
if rest != []:
raise QuasiXmlParseError("protocol error: trailing data")
return elem
try:
unicode # Python 2
except NameError:
unicode = str
def fromstring(s):
if type(s) is unicode:
s = s.encode("utf-8")
if type(s) is not bytes:
raise TypeError("expected a bytes-object, got {}".format(type(s).__name__))
return fromtokens(tokenize(s))
def encode_entities(s):
from . import XML_entities_active
for k, v in XML_entities_active():
s = s.replace(k, v)
return s
def decode_entities(s):
from . import XML_entities_active
rev = list(XML_entities_active())
rev.reverse() # (make sure & is decoded last)
for k, v in rev:
s = s.replace(v, k)
return s
|