1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
|
# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
This module includes a fast iterator-based XML parser.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from ...extern import six
# STDLIB
import contextlib
import io
import sys
# ASTROPY
from .. import data
__all__ = ['get_xml_iterator', 'get_xml_encoding', 'xml_readlines']
@contextlib.contextmanager
def _convert_to_fd_or_read_function(fd):
    """
    Yield either a raw file object or a read function for ``fd``.

    This is only useful when handing input off to C code, where:

      - a real file object can be used as a real C file object,
        avoiding the Python overhead;

      - for anything else it is much handier to have a plain Python
        function to call.

    That is quirky behavior, which is why this helper is private.
    The general-purpose opener it builds on is
    ``data.get_readable_fileobj``.

    Parameters
    ----------
    fd : object
        May be:

          - a callable, which is yielded verbatim and assumed to be a
            function that reads from a stream;

          - anything else, which is opened in ``'binary'`` mode with
            ``data.get_readable_fileobj``.

    Returns
    -------
    fd : context-dependent
        For a callable input, the callable itself.  Otherwise, the
        opened object itself when it is a ``file`` (Python 2) or
        `io.FileIO` (Python 3) instance and the platform is not
        Windows; in every other case the opened object's ``read``
        method.
    """
    if six.callable(fd):
        # Already a read function: nothing to open or close.
        yield fd
    else:
        with data.get_readable_fileobj(fd, encoding='binary') as new_fd:
            # The "real file" type differs between Python 2 and 3.  Only
            # the selected branch is evaluated, so ``file`` is never
            # looked up on Python 3.
            raw_file_type = file if six.PY2 else io.FileIO
            on_windows = sys.platform.startswith('win')

            # On Windows the read method is always used, whatever the
            # type of the opened object.
            if not on_windows and isinstance(new_fd, raw_file_type):
                yield new_fd
            else:
                yield new_fd.read
def _fast_iterparse(fd, buffersize=2 ** 10):
    """
    Generate start/end element events from an XML stream using expat.

    Parameters
    ----------
    fd : callable or object with a ``read`` method
        The input.  It (or its ``read`` method) is called as
        ``read(buffersize)`` repeatedly until it returns a false value.

    buffersize : int, optional
        The size passed to every read call.

    Returns
    -------
    events : generator
        Yields 4-tuples (*start*, *tag*, *data*, *pos*):

          - *start*: `True` for a start element event, `False` for an
            end element event.

          - *tag*: the element name.

          - *data*: for a start event, the attribute dictionary; for
            an end event, the stripped concatenation of all character
            data received since the most recent start tag (of any
            element).

          - *pos*: tuple (*line*, *col*) read from the parser when the
            event was recorded.
    """
    from xml.parsers import expat

    read = fd if six.callable(fd) else fd.read

    # Events recorded by the expat callbacks during one ``Parse`` call;
    # drained to the consumer after every call.
    events = []
    # Character data fragments collected since the last start tag.
    chunks = []

    def here():
        return (parser.CurrentLineNumber, parser.CurrentColumnNumber)

    def start(name, attr):
        events.append((True, name, attr, here()))
        del chunks[:]

    if sys.version_info[:3] < (2, 6, 5):  # pragma py2
        # Due to Python issue #4978, convert all keys to byte strings
        _start = start

        def start(name, attr):
            encoded = dict(
                (key.encode('utf-8'), value)
                for (key, value) in six.iteritems(attr))
            return _start(name, encoded)

    def end(name):
        events.append((False, name, ''.join(chunks).strip(), here()))

    # NOTE(review): no XML-declaration handler is installed, so this
    # implementation produces no event for ``<?xml ...?>``, while
    # `get_xml_encoding` requires a leading 'xml' event -- confirm that
    # function is only expected to work with the C parser.
    parser = expat.ParserCreate()
    if six.PY2:
        parser.returns_unicode = True
    # Report only attributes explicitly present in the document.
    parser.specified_attributes = True
    parser.StartElementHandler = start
    parser.EndElementHandler = end
    parser.CharacterDataHandler = chunks.append
    feed = parser.Parse

    while True:
        block = read(buffersize)
        if not block:
            break
        feed(block, False)
        for event in events:
            yield event
        del events[:]

    # Tell expat the input is complete, then flush whatever that produced.
    feed('', True)
    for event in events:
        yield event
# Keep the pure-Python parser reachable under its own name, then rebind
# ``_fast_iterparse`` to the C version of the iterparser whenever that
# extension module can be imported.
_slow_iterparse = _fast_iterparse

try:
    from . import _iterparser
except ImportError:
    # No C extension available: ``_fast_iterparse`` stays bound to the
    # Python implementation above.
    pass
else:
    _fast_iterparse = _iterparser.IterParser
@contextlib.contextmanager
def get_xml_iterator(source, _debug_python_based_parser=False):
    """
    Context manager providing an iterator over the elements of an XML
    file.

    The iterator doesn't ever build a tree, so it is much more memory
    and time efficient than the alternative in ``cElementTree``.

    Parameters
    ----------
    source : readable file-like object, read function or str path
        Passed to ``_convert_to_fd_or_read_function`` to obtain the
        input for the parser.

    _debug_python_based_parser : bool, optional
        When true, parse with ``_slow_iterparse`` instead of
        ``_fast_iterparse``.

    Returns
    -------
    parts : iterator
        Available as the target of the ``with`` statement.  The
        iterator returns 4-tuples (*start*, *tag*, *data*, *pos*):

          - *start*: when `True` is a start element event, otherwise
            an end element event.

          - *tag*: The name of the element

          - *data*: Depends on the value of *start*:

              - if *start* == `True`, data is a dictionary of
                attributes

              - if *start* == `False`, data is a string containing
                the text content of the element

          - *pos*: Tuple (*line*, *col*) indicating the source of the
            event.
    """
    parse = (_slow_iterparse if _debug_python_based_parser
             else _fast_iterparse)

    # The input stays open for as long as the caller is inside the
    # ``with`` block.
    with _convert_to_fd_or_read_function(source) as fd:
        yield iter(parse(fd))
def get_xml_encoding(source):
    """
    Determine the encoding of an XML file by reading its header.

    Only the first event produced by `get_xml_iterator` is examined;
    it must be a start event whose tag is ``'xml'``.

    Parameters
    ----------
    source : readable file-like object, read function or str path

    Returns
    -------
    encoding : str
        The value of the ``encoding`` entry of the first event's
        attributes, or ``'utf-8'`` when that entry is missing or empty.

    Raises
    ------
    IOError
        If the iterator produces no events at all, or if the first
        event is not a start event for the tag ``'xml'``.
    """
    with get_xml_iterator(source) as iterator:
        try:
            start, tag, attrs, pos = six.next(iterator)
        except StopIteration:
            # A stream without any events has no header to inspect.
            # Report it like any other malformed input instead of
            # letting StopIteration escape to the caller.
            raise IOError('Invalid XML file')

        if not start or tag != 'xml':
            raise IOError('Invalid XML file')

    # The XML spec says that no encoding === utf-8
    return attrs.get('encoding') or 'utf-8'
def xml_readlines(source):
    """
    Read all lines of an XML file, decoded with the encoding reported
    by `get_xml_encoding`.  Always returns unicode.

    Parameters
    ----------
    source : readable file-like object, read function or str path

    Returns
    -------
    lines : list of unicode
    """
    detected = get_xml_encoding(source)

    with data.get_readable_fileobj(source, encoding=detected) as stream:
        # Rewind before reading -- presumably because the encoding probe
        # above may have advanced a shared file object (confirm).
        stream.seek(0)
        return stream.readlines()
|