1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
|
# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
Contains a class that makes it simple to stream out well-formed and
nicely-indented XML.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from ...extern import six
# STDLIB
import contextlib
import textwrap
# Optional dependency probe: ``bleach`` is only needed for the
# ``bleach_clean`` cleaning method, so record whether it is importable
# instead of failing at import time.
try:
    import bleach
except ImportError:
    HAS_BLEACH = False
else:
    HAS_BLEACH = True
# Prefer the escaping routines exposed by the ``_iterparser`` extension
# module; fall back to pure-Python equivalents when it cannot be imported.
try:
    from . import _iterparser
except ImportError:
    def xml_escape_cdata(s):
        """
        Escapes &, < and > in an XML CDATA string.

        Parameters
        ----------
        s : str
            Character data to escape.

        Returns
        -------
        escaped : str
            ``s`` with ``&``, ``<`` and ``>`` replaced by the
            corresponding XML entities.
        """
        # "&" must be handled first so that the ampersands introduced by
        # the following replacements are not escaped a second time.
        s = s.replace("&", "&amp;")
        s = s.replace("<", "&lt;")
        s = s.replace(">", "&gt;")
        return s

    def xml_escape(s):
        """
        Escapes &, ', ", < and > in an XML attribute value.

        Parameters
        ----------
        s : str
            Attribute value to escape.

        Returns
        -------
        escaped : str
            ``s`` with ``&``, ``'``, ``"``, ``<`` and ``>`` replaced by
            the corresponding XML entities.
        """
        # "&" must be handled first so that the ampersands introduced by
        # the following replacements are not escaped a second time.
        s = s.replace("&", "&amp;")
        s = s.replace("'", "&apos;")
        s = s.replace("\"", "&quot;")
        s = s.replace("<", "&lt;")
        s = s.replace(">", "&gt;")
        return s
else:
    xml_escape_cdata = _iterparser.escape_xml_cdata
    xml_escape = _iterparser.escape_xml
class XMLWriter:
    """
    A class to write well-formed and nicely indented XML.

    Use like this::

        w = XMLWriter(fh)
        with w.tag('html'):
            with w.tag('body'):
                w.data('This is the content')

    Which produces markup equivalent to::

        <html>
         <body>
          This is the content
         </body>
        </html>

    The exact whitespace around character data depends on the ``indent``
    and ``wrap`` options passed to `end` / `element`.
    """

    def __init__(self, file):
        """
        Parameters
        ----------
        file : writable file-like object.
            Only its ``write`` method is required; ``flush`` is used when
            present.
        """
        self.write = file.write
        if hasattr(file, "flush"):
            # Shadows the no-op ``flush`` method defined on the class.
            self.flush = file.flush
        self._open = 0  # true if start tag is open
        self._tags = []  # stack of currently open element names
        self._data = []  # character data buffered for the current element
        self._indentation = " " * 64  # cache sliced by get_indentation_spaces

        # Per-instance escaping hooks; ``xml_cleaning_method`` temporarily
        # swaps ``xml_escape_cdata``.
        self.xml_escape_cdata = xml_escape_cdata
        self.xml_escape = xml_escape

    def _flush(self, indent=True, wrap=False):
        """
        Flush internal buffers.

        Terminates a pending start tag, then writes any buffered
        character data (escaped with ``self.xml_escape_cdata``).

        Parameters
        ----------
        indent : bool
            If `True`, a newline is written after the ``>`` that
            terminates a pending start tag.
        wrap : bool
            If `True`, the buffered data is re-flowed with
            `textwrap.fill`, indented one level deeper than the current
            depth, and written on its own lines.
        """
        if self._open:
            self.write(">\n" if indent else ">")
            self._open = 0

        if not self._data:
            return

        text = ''.join(self._data)
        if wrap:
            pad = self.get_indentation_spaces(1)
            text = textwrap.fill(
                text,
                initial_indent=pad,
                subsequent_indent=pad)
            self.write('\n')
            self.write(self.xml_escape_cdata(text))
            self.write('\n')
            self.write(self.get_indentation_spaces())
        else:
            self.write(self.xml_escape_cdata(text))
        self._data = []

    def start(self, tag, attrib={}, **extra):
        """
        Opens a new element.  Attributes can be given as keyword
        arguments, or as a string/string dictionary.  The method
        returns an identifier that can be passed to the :meth:`close`
        method.

        Parameters
        ----------
        tag : str
            The element name

        attrib : dict of str -> str
            Attribute dictionary.  Alternatively, attributes can
            be given as keyword arguments.  Attributes whose value is
            `None` are omitted.  The dictionary is never modified.

        Returns
        -------
        id : int
            Returns an element identifier (the number of open elements
            after this one has been opened).

            NOTE(review): because this is the depth *after* pushing the
            new element, ``close(id)`` leaves this element itself open.
        """
        self._flush()
        # This is just busy work -- we know our tag names are clean
        # tag = xml_escape_cdata(tag)
        self._data = []
        self._tags.append(tag)
        # The new tag is already on the stack, hence the -1 offset.
        self.write(self.get_indentation_spaces(-1))
        self.write("<{}".format(tag))
        if attrib or extra:
            # Merge into a fresh dict so neither the caller's dictionary
            # nor the shared default argument is ever mutated.
            merged = dict(attrib)
            merged.update(extra)
            # Attributes are written in sorted order.
            for k, v in sorted(merged.items()):
                if v is not None:
                    # This is just busy work -- we know our keys are clean
                    # k = xml_escape_cdata(k)
                    v = self.xml_escape(v)
                    self.write(" {}=\"{}\"".format(k, v))
        self._open = 1
        return len(self._tags)

    @contextlib.contextmanager
    def xml_cleaning_method(self, method='escape_xml', **clean_kwargs):
        """Context manager to control how XML data tags are cleaned (escaped) to
        remove potentially unsafe characters or constructs.

        The default (``method='escape_xml'``) applies brute-force escaping of
        certain key XML characters like ``<``, ``>``, and ``&`` so that the
        data cannot be interpreted as XML markup.

        In order to explicitly allow certain XML tags (e.g. link reference or
        emphasis tags), use ``method='bleach_clean'``.  This sanitizes the data
        string using the ``clean`` function of the
        `bleach <http://bleach.readthedocs.io/en/latest/clean.html>`_ package.
        Any additional keyword arguments will be passed directly to the
        ``clean`` function.

        The previous cleaning function is restored when the ``with`` block
        exits, including when it exits because of an exception.

        Example::

          w = writer.XMLWriter(ListWriter(lines))
          with w.xml_cleaning_method('bleach_clean'):
              w.start('td')
              w.data('<a href="http://google.com">google.com</a>')
              w.end()

        Parameters
        ----------
        method : str
            Cleaning method.  Allowed values are "escape_xml" and
            "bleach_clean".

        **clean_kwargs : keyword args
            Additional keyword args that are passed to the
            bleach.clean() function.

        Raises
        ------
        ValueError
            If ``method`` is not one of the allowed values, or if
            ``'bleach_clean'`` is requested but bleach is not installed.
        """
        previous_cleaner = self.xml_escape_cdata

        if method == 'bleach_clean':
            if not HAS_BLEACH:
                raise ValueError('bleach package is required when HTML escaping is disabled.\n'
                                 'Use "pip install bleach".')
            self.xml_escape_cdata = lambda x: bleach.clean(x, **clean_kwargs)
        elif method != 'escape_xml':
            raise ValueError('allowed values of method are "escape_xml" and "bleach_clean"')

        try:
            yield
        finally:
            # Always restore, so an error inside the block cannot leave
            # the writer permanently using the temporary cleaner.
            self.xml_escape_cdata = previous_cleaner

    @contextlib.contextmanager
    def tag(self, tag, attrib={}, **extra):
        """
        A convenience method for creating wrapper elements using the
        ``with`` statement.

        Examples
        --------

        >>> with writer.tag('foo'):  # doctest: +SKIP
        ...     writer.element('bar')
        ... # </foo> is implicitly closed here
        ...

        Parameters are the same as to `start`.
        """
        self.start(tag, attrib, **extra)
        yield
        self.end(tag)

    def comment(self, comment):
        """
        Adds a comment to the output stream.

        Parameters
        ----------
        comment : str
            Comment text, as a Unicode string.
        """
        self._flush()
        self.write(self.get_indentation_spaces())
        self.write("<!-- {} -->\n".format(self.xml_escape_cdata(comment)))

    def data(self, text):
        """
        Adds character data to the output stream.

        The text is buffered and only written (escaped) when the buffers
        are next flushed.

        Parameters
        ----------
        text : str
            Character data, as a Unicode string.
        """
        self._data.append(text)

    def end(self, tag=None, indent=True, wrap=False):
        """
        Closes the current element (opened by the most recent call to
        `start`).

        An element that has neither character data nor child content is
        written in self-closing form (``<tag/>``).

        Parameters
        ----------
        tag : str
            Element name.  If given, the tag must match the start tag.
            If omitted, the current element is closed.

        indent : bool
            If `True`, indentation is written before the closing tag
            (and a newline after a pending start tag's ``>``).

        wrap : bool
            If `True`, buffered character data is wrapped and indented
            with `textwrap.fill` before being written.
        """
        # NOTE: validation uses ``assert`` and is therefore skipped when
        # Python runs with ``-O``.
        if tag:
            assert self._tags, "unbalanced end({})".format(tag)
            assert tag == self._tags[-1],\
                "expected end({}), got {}".format(self._tags[-1], tag)
        else:
            assert self._tags, "unbalanced end()"
        tag = self._tags.pop()
        if self._data:
            self._flush(indent, wrap)
        elif self._open:
            # Nothing was written inside the element: self-close it.
            self._open = 0
            self.write("/>\n")
            return
        if indent:
            self.write(self.get_indentation_spaces())
        self.write("</{}>\n".format(tag))

    def close(self, id):
        """
        Closes open elements until only ``id`` elements remain open.

        NOTE(review): `start` returns the depth *after* opening its
        element, so passing that value here closes everything nested
        inside that element but leaves the element itself open.

        Parameters
        ----------
        id : int
            Element identifier, as returned by the `start` method.
        """
        while len(self._tags) > id:
            self.end()

    def element(self, tag, text=None, wrap=False, attrib={}, **extra):
        """
        Adds an entire element.  This is the same as calling `start`,
        `data`, and `end` in sequence. The ``text`` argument
        can be omitted.

        Parameters
        ----------
        tag : str
            The element name.

        text : str, optional
            Character data for the element; skipped when empty or `None`.

        wrap : bool
            Passed on to `end`.

        attrib : dict of str -> str
            Attribute dictionary, as for `start`.
        """
        self.start(tag, attrib, **extra)
        if text:
            self.data(text)
        self.end(indent=False, wrap=wrap)

    def flush(self):
        pass  # replaced by the constructor

    def get_indentation(self):
        """
        Returns the number of indentation levels the file is currently
        in.
        """
        return len(self._tags)

    def get_indentation_spaces(self, offset=0):
        """
        Returns a string of spaces that matches the current
        indentation level.

        Parameters
        ----------
        offset : int
            Added to the current depth; may be negative.

        Returns
        -------
        spaces : str
            One space per indentation level, never fewer than zero.
        """
        # Clamp at zero: a negative slice bound would otherwise return
        # almost the whole cache string instead of an empty one.
        width = max(len(self._tags) + offset, 0)
        if width > len(self._indentation):
            # Grow the cache so that deep nesting is not truncated.
            self._indentation = " " * width
        return self._indentation[:width]

    @staticmethod
    def object_attrs(obj, attrs):
        """
        Converts an object with a bunch of attributes on an object
        into a dictionary for use by the `XMLWriter`.

        Parameters
        ----------
        obj : object
            Any Python object

        attrs : sequence of str
            Attribute names to pull from the object

        Returns
        -------
        attrs : dict
            Maps attribute names to the values retrieved from
            ``obj.attr``, converted to text.  Underscores in the names
            are replaced by hyphens.  If any of the attributes is
            `None`, it will not appear in the output dictionary.
        """
        d = {}
        for attr in attrs:
            value = getattr(obj, attr)
            if value is not None:
                d[attr.replace('_', '-')] = six.text_type(value)
        return d
|