1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
|
#!/usr/bin/python3
"""Simple XML element sorter.
This module can be used by importing `sort_xml` or by running standalone from the command-line.
"""
# Copyright (c) 2022, Chris Koch <kopachris@gmail.com>
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# (1) Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# (2) Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# (3)The name of the author may not be used to
# endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# Module version as a tuple of integers, so it can be compared element-wise.
__version__ = (0, 1, 0)
# The same version rendered as a dotted string; the command-line `--version` option prints it.
__version_str__ = '.'.join(map(str, __version__))
# Description text handed to the command-line argument parser and shown in its help output.
__description__ = """
A simple XML element sorter. Will sort the children of selected elements
using a given attribute's value or subelement's text as the sort key.
Example usage:
$ python sortxml.py ARForm_orig.rdl "./DataSets/DataSet[@Name='ARForm']/Fields" Name -o ARForm.rdl
"""
import argparse as ap
import xml.etree.ElementTree as ET
from pathlib import Path
from io import TextIOWrapper
from codecs import BOM_UTF8
from decimal import Decimal
from dateutil.parser import parse as parse_dt
class NSElement(ET.Element):
    """Element that remembers which tree builder created it and that builder's namespace map.

    The remembered prefix-to-URI map becomes the default `namespaces` argument of the search
    methods (`find`, `findall`, `findtext`, `iterfind`), so a path may use the prefixes
    recorded by the builder without the caller supplying a mapping on every call. A mapping
    passed explicitly always takes priority over the remembered one.
    """

    def __init__(self, *args, **kwargs):
        # `builder` is this class's own keyword: take it out before the remaining arguments
        # are forwarded, because ET.Element treats extra keywords as XML attributes.
        creator = kwargs.pop('builder', None)
        self._builder = creator
        # Use the builder's own map object when it has one; otherwise start with an empty map.
        self._ns_map = getattr(creator, 'ns_map', dict())
        super().__init__(*args, **kwargs)

    def find(self, path, namespaces=None):
        """Return the first subelement matching `path`, or None when nothing matches."""
        effective = self._ns_map if namespaces is None else namespaces
        return super().find(path, effective)

    def findall(self, path, namespaces=None):
        """Return a list of every subelement matching `path`."""
        effective = self._ns_map if namespaces is None else namespaces
        return super().findall(path, effective)

    def findtext(self, path, default=None, namespaces=None):
        """Return the text of the first subelement matching `path`, or `default` if none matches."""
        effective = self._ns_map if namespaces is None else namespaces
        return super().findtext(path, default, effective)

    def iterfind(self, path, namespaces=None):
        """Return an iterator over the subelements matching `path`."""
        effective = self._ns_map if namespaces is None else namespaces
        return super().iterfind(path, effective)
class NSTreeBuilder(ET.TreeBuilder):
    """TreeBuilder that records the document's namespace declarations and builds NSElement nodes.

    Every prefix/URI pair reported by the parser is stored in `ns_map` and, where possible,
    also added to ElementTree's global namespace registry so that the original prefixes are
    reused when the tree is serialised. Each element created by this builder is an NSElement
    that holds a reference to the builder, and therefore to the shared `ns_map`.

    Keyword arguments are forwarded to ET.TreeBuilder, except `element_factory`, which is
    discarded because this builder always produces NSElement instances.
    """

    def __init__(self, **kwargs):
        # Prefix -> namespace URI, filled in by start_ns() while the document is parsed.
        self.ns_map = dict()
        kwargs.pop('element_factory', None)
        # The public element_factory hook is used instead of overriding start(): an override
        # would have to reach into TreeBuilder's private attributes (_factory, _elem, ...),
        # which exist only on the pure-Python implementation and not on the C-accelerated
        # class that ET.TreeBuilder normally refers to.
        super().__init__(element_factory=self._new_element, **kwargs)

    def _new_element(self, tag, attrs):
        """Element factory handed to TreeBuilder: create an NSElement bound to this builder."""
        return NSElement(tag, attrs, builder=self)

    def start_ns(self, prefix, uri):
        """Record a namespace declaration reported by the parser.

        `prefix` is the declared prefix and `uri` the namespace it maps to. The pair is always
        stored in `ns_map`; registering it globally is best-effort.
        """
        self.ns_map[prefix] = uri
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # register_namespace() refuses prefixes of the form "ns<digits>", which it reserves
            # for automatically generated prefixes (and which ElementTree itself writes into
            # documents). Such a declaration must not abort parsing: the mapping is still
            # available through ns_map for path lookups.
            pass
def _names_existing_file(candidate):
    """Return True when `candidate` (a str or Path) refers to an existing regular file."""
    try:
        return Path(candidate).is_file()
    except (OSError, ValueError):
        # A string holding XML content is usually not a usable path at all; probing it can
        # fail (for example with "file name too long") instead of simply reporting False.
        return False


def _make_sort_key(sort_attr, use_text, sort_as_datetime, sort_as_decimal):
    """Build the key function used to order child elements.

    The raw key is the text of the `sort_attr` subelement when `use_text` is true, otherwise
    the value of the `sort_attr` attribute. It is then parsed as a datetime, parsed as a
    Decimal, or left as a string; datetime takes precedence when both flags are set.
    """
    if use_text:
        def raw_key(elem):
            return elem.findtext(sort_attr)
    else:
        def raw_key(elem):
            return elem.get(sort_attr)

    if sort_as_datetime:
        return lambda elem: parse_dt(raw_key(elem))
    if sort_as_decimal:
        return lambda elem: Decimal(raw_key(elem))
    return raw_key


def sort_xml(xml_doc, node_path, sort_attr, use_text=False, sort_as_datetime=False, sort_as_decimal=False,
             descending=False):
    """Sort the children of a selection of elements in an XML document. Returns an ElementTree representing the
    resulting whole document. ElementTree can easily be converted to string or written to a file like so:

    >>> foo_str = ET.tostring(sort_xml(xml_doc, node_path, sort_attr).getroot())
    >>> sort_xml(xml_doc, node_path, sort_attr).write('foo.xml')

    Required arguments:
    -------------------
    * `xml_doc` -- a text IO stream (such as an open file object), Path object pointing to an XML
      file, string representing the file path, or string containing the file contents of a valid XML file. Can't
      take an ElementTree instance because we need to use our own parser to keep track of namespaces.
    * `node_path` -- a string containing the path to the node you want to sort the children of in the XPath
      language of the etree module
    * `sort_attr` -- the attribute of the child elements to use as the sort key

    Optional arguments:
    -------------------
    * `use_text` -- use `sort_attr` as the name of a subelement of the path's children whose text will be the
      sort key (default: False)
    * `sort_as_datetime` -- try to parse the values of the sort key as a datetime using the `dateutil` module and
      sort chronologically (default: False, mutually exclusive with `sort_as_decimal`; if both are given,
      `sort_as_datetime` wins)
    * `sort_as_decimal` -- try to parse the values of the sort key as a decimal and sort numerically (useful to
      keep '10' from showing up right after '1') (default: False, mutually exclusive with `sort_as_datetime`)
    * `descending` -- sort in descending order instead of ascending (default: False)

    Raises:
    -------
    * `TypeError` -- `xml_doc` is not one of the supported kinds of input, or `sort_attr` is not a non-empty
      string
    * `ValueError` -- `sort_attr` is not made up of letters, digits and underscores only, or does not start with
      a letter or underscore

    Errors raised by the XML parser or while converting/comparing sort keys (for example when a child lacks the
    sort key) are not caught here and propagate to the caller.
    """
    # --- xml_doc: obtain the document text -------------------------------------------------
    if isinstance(xml_doc, TextIOWrapper) and xml_doc.readable():
        # A readable text stream: switch it to 'utf-8-sig' first so a byte order mark, if
        # present, is dropped while reading.
        if xml_doc.encoding != 'utf-8-sig':
            xml_doc.reconfigure(encoding='utf-8-sig')
        xml_str = xml_doc.read()
    elif isinstance(xml_doc, (Path, str)) and _names_existing_file(xml_doc):
        # A Path or a file name: 'utf-8-sig' removes a byte order mark, if present.
        xml_str = Path(xml_doc).read_text('utf-8-sig')
    elif isinstance(xml_doc, str) and len(xml_doc) > 0:
        # Hopefully the XML content itself. Once decoded, the UTF-8 byte order mark is a
        # single character, so exactly len(bom) characters are removed.
        bom = BOM_UTF8.decode('utf-8')
        xml_str = xml_doc[len(bom):] if xml_doc.startswith(bom) else xml_doc
    else:
        raise TypeError("sort_xml() requires first parameter must be a string, readable IO stream, or path for a "
                        f"valid xml file! xml_doc: {repr(xml_doc)}")

    # --- sort_attr: must look like a simple name -------------------------------------------
    if not (isinstance(sort_attr, str) and len(sort_attr) > 0):
        raise TypeError("sort_xml() requires sort attribute must be a non-empty string!\n\t"
                        f"sort_attr: {repr(sort_attr)}")
    sort_attr = sort_attr.strip()
    if not (sort_attr.replace('_', '').isalnum() and (sort_attr[0].isalpha() or sort_attr[0] == '_')):
        raise ValueError("Sort attribute passed to sort_xml() is an invalid name!\n\t"
                         f"sort_attr: {repr(sort_attr)}")

    # Parse with the namespace-tracking tree builder so that paths can use the document's own
    # prefixes, then find every parent whose children have to be sorted.
    dom = ET.fromstring(xml_str, ET.XMLParser(target=NSTreeBuilder()))

    # The kind of sorting is decided once, not once per matching parent.
    sort_key = _make_sort_key(sort_attr, use_text, sort_as_datetime, sort_as_decimal)
    for parent in dom.findall(node_path):
        # Slice assignment replaces the children in place, keeping the parent element itself.
        parent[:] = sorted(parent, key=sort_key, reverse=descending)
    return ET.ElementTree(dom)
def main(argv=None):
    """Command-line entry point: sort an XML file and write the result to a new file.

    `argv` is the list of command-line argument strings; when it is None, argparse reads the
    arguments from `sys.argv`. The sorted document is written as UTF-8 to the path given with
    `-o/--output`, or, when that option is absent, next to the input file with '_sorted'
    appended to the file name's stem.
    """
    argp = ap.ArgumentParser(description=__description__, formatter_class=ap.RawDescriptionHelpFormatter)
    argp.add_argument('-v', '--version', action='version', version=f"%(prog)s -- version {__version_str__}")
    argp.add_argument('input_file', type=Path, help="File path to the source xml file.")
    argp.add_argument('sort_xpath',
                      help="XPath-style selector for elements to sort the children of. This has the same limitations "
                           "as Python's ElementTree module.")
    argp.add_argument('sort_attr', help="The name of the attribute to use as the sort key.")
    argp.add_argument('-r', '--reverse', '--descending', action='store_true', dest='descending',
                      help="Sort the child elements in reverse (descending) order.")
    argp.add_argument('-t', '--text', '--use-text', action='store_true', dest='use_text',
                      help="Treat the sort attribute name as the name of a subelement whose text is the sort key.")
    sort_style = argp.add_mutually_exclusive_group()
    sort_style.add_argument('--datetime', '--as-datetime', action='store_true', dest='as_datetime',
                            help="Try to parse the sort key as a date/time value. Mutually exclusive with --decimal.")
    sort_style.add_argument('--decimal', '--as-decimal', action='store_true', dest='as_decimal',
                            help="Try to parse the sort key as a decimal number. Mutually exclusive with --datetime.")
    argp.add_argument('-o', '--output', type=Path, dest='output_file',
                      help="File path to the destination file. (Default is to append '_sorted' to the filename.)")
    args = argp.parse_args(argv)

    in_file = args.input_file
    sorted_xml = sort_xml(in_file, args.sort_xpath, args.sort_attr, args.use_text, args.as_datetime,
                          args.as_decimal, args.descending)

    # argparse always defines `output_file` and stores None when -o/--output is not given, so
    # the value has to be tested; checking for the attribute's existence would never select
    # the default name.
    out_file = args.output_file
    if out_file is None:
        out_file = in_file.with_stem(in_file.stem + '_sorted')

    out_file.write_text(ET.tostring(sorted_xml.getroot(), encoding='unicode'), encoding='utf-8')
    print(f"Output sorted file as `{out_file}`")


if __name__ == '__main__':
    main()
|