1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
|
#!/usr/bin/env python
"""
A command line tool for running reports on XML files.
trimxml allows you to rapidly extract details from large XML files
on the command line.
Run "trimxml --help" for details of the command line parameters, but
here are some pointers to get you started.
Let's say you have a simple database dump format with the following
form:
<db>
<record id="1">
<name>Alex</name>
<address>123 Maple St.</address>
</record>
<record id="2">
<name>Bob</name>
<address>456 Birch Rd.</address>
</record>
<record id="3">
<name>Chris</name>
<address>789 Pine St.</address>
</record>
</db>
You can:
Get the full contents of all name elements
$ trimxml file.xml name
<name>Alex</name>
<name>Bob</name>
<name>Chris</name>
Get the full contents of the record with ID 2
$ trimxml file.xml record "@id='2'"
<record id="2">
<name>Bob</name>
<address>456 Birch Rd.</address>
</record>
Get the full contents of the first two name elements
$ trimxml -c 2 file.xml name
<name>Alex</name>
<name>Bob</name>
Get the name of the record with ID 2
$ trimxml -d "name" file.xml record "@id='2'"
<name>Bob</name>
You could display the id and each corresponding name as follows:
$ trimxml file.xml "@id|name"
1
<name>Alex</name>
2
<name>Bob</name>
3
<name>Chris</name>
Or a more precise approach might be (demonstrating the use of XPath functions):
$ trimxml -d "concat(@id, ': ', name)" file.xml record
1: Alex
2: Bob
3: Chris
trimxml uses namespaces declared on the document element, so you can
conveniently make queries without needing to separately declare prefixes.
So to get the URLs of all a links in an XHTML document you could do:
trimxml -d "@href" file.xhtml "ht:a"
As long as there is a namespace declaration
xmlns:ht="http://www.w3.org/1999/xhtml" in the document. If not
(many XHTML documents use the default namespace, which, courtesy of XPath 1.0
restrictions, prevents trimxml from doing any guesswork for you) you have
to declare the prefix.
trimxml --ns=ht="http://www.w3.org/1999/xhtml" -d "@href" http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml "ht:a"
Notice how this example loads the source XML (XHTML) from a Web URL rather than a local file. Of course, a shortcut for this is simply:
trimxml http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml "@href"
"""
#The following won't work because EXSLT is only supported in XsltContext and we use Ft.Xml.XPath.Context
#We can probably revisit when we make bindery nodes subclasses of Domlette
#trimxml --ns=str="http://exslt.org/strings" -d "str:replace(., 'http://', '')" http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml "@href"
import os
import re
import sys
import codecs
import optparse
#import cStringIO
import amara
from amara import saxtools
from xml.dom import Node
#from xml.dom import EMPTY_NAMESPACE as NULL_NAMESPACE
#from xml.dom import EMPTY_PREFIX as NULL_PREFIX
#FIXME: Use 4Suite L10N
def _(t):
    """Return *t* unchanged.

    Placeholder for a message-translation lookup (see the FIXME about
    localization); for now every message passes through as-is.
    """
    return t
def _parse_prefix_decls(decls):
    """Turn a sequence of ``PREFIX=URI`` strings into a ``{prefix: uri}`` dict.

    Raises ValueError("Invalid prefix declaration") when an entry has no '='.
    """
    try:
        # Split on the first '=' only: a namespace URI may itself contain
        # '=' (for example in a query string) and must be kept intact.
        return dict([decl.split('=', 1) for decl in decls or ()])
    except ValueError:
        raise ValueError("Invalid prefix declaration")


def _render_node(node):
    """Return the text to print for one node of an XPath node-set result."""
    # Explicit branch rather than ``cond and a or b``: an attribute whose
    # value is the empty string must still be rendered as its value.
    if node.nodeType == Node.ATTRIBUTE_NODE:
        return node.nodeValue
    return node.xml()


def run(source, xpattern, xpath, limit, sentinel, display, prefixes):
    """Print a report of the nodes in *source* that match *xpattern*.

    Parameters:
    source   -- file name/URL string, or a file-like object (has ``read``)
    xpattern -- pattern passed to amara.pushbind to select nodes
    xpath    -- optional XPath screen; a matched node is only reported when
                this evaluates true on it (falsy means report every match)
    limit    -- stop once this many nodes have been reported; -1 = no limit
    sentinel -- optional XPath; processing stops when it evaluates true
    display  -- optional XPath selecting what to print for each reported
                node (falsy means print the whole node)
    prefixes -- sequence of "PREFIX=URI" strings, or None

    Raises ValueError if a prefix declaration is malformed.
    Returns None; all output goes to standard output.
    """
    declared = _parse_prefix_decls(prefixes)

    if hasattr(source, 'read'):
        if hasattr(source, 'rewind'):
            nss = saxtools.sniff_namespace(source)
            #Sniffing consumed the stream; go back to the start for the parse
            source.rewind()
        else:
            #Cannot rewind, so pull the whole document into memory and
            #use that string for both the sniffing and the parse
            source = source.read()
            nss = saxtools.sniff_namespace(source)
    else:
        nss = saxtools.sniff_namespace(source)
    #Explicit declarations take precedence over sniffed ones
    nss.update(declared)

    count = 0
    for node in amara.pushbind(source, xpattern, prefixes=nss):
        if not xpath or node.xml_xpath(xpath):
            count += 1
            if display:
                #Print specified subset
                result = node.xml_xpath(display)
                if isinstance(result, list):
                    print('\n'.join([_render_node(n) for n in result]))
                else:
                    print(result)
            else:
                #Print the whole thing
                try:
                    print(node.xml())
                except AttributeError:
                    print(unicode(node).encode('utf-8'))
        #NOTE(review): these two checks are placed at loop level, i.e. run
        #for every pattern match as the --sentinel help text describes --
        #confirm against the intended nesting.
        if limit != -1 and count >= limit:
            break
        if sentinel and node.xml_xpath(sentinel):
            break
    return
class Usage(Exception):
    """Exception carrying a usage/command-line error message in ``msg``."""

    def __init__(self, msg):
        """Remember *msg* so a handler can report it."""
        self.msg = msg
def command_line_prep():
    """Build and return the optparse parser for the command line.

    Registered options: -c/--limit (int, default -1), -d/--display,
    -n/--ns (repeatable, collected into a list) and --sentinel.
    Positional arguments (source, xpattern, optional xpath) are not
    validated here.
    """
    #(flags, keyword settings) for each option, in help-listing order
    option_table = [
        (("-c", "--limit"), {
            "action": "store", "type": "int", "dest": "limit", "default": -1,
            "help": "limit the number of xpattern matches retrieved; files will not be parsed beyond this number, so it serves as optimization",
            "metavar": "NUMBER",
        }),
        (("-d", "--display"), {
            "action": "store", "type": "string", "dest": "display",
            "help": "xpath expression indicating what nodes to be displayed from matched and screened patterns",
            "metavar": "XPATH",
        }),
        (("-n", "--ns"), {
            "action": "append", "type": "string", "dest": "ns",
            "help": "prefix to namespace mapping",
            "metavar": "<PREFIX=URI>",
        }),
        (("--sentinel",), {
            "action": "store", "type": "string", "dest": "sentinel",
            "help": "xpath expression to be checked for each pattern match. If true it causes the reporting to stop, with no further parsing",
            "metavar": "XPATH",
        }),
    ]
    #parser.add_option("-q", "--quiet",
    #                  action="store_false", dest="verbose", default=1,
    #                  help="don't print status messages to stdout")
    cli = optparse.OptionParser(usage="%prog [options] source xpattern [xpath]")
    for flags, settings in option_table:
        cli.add_option(*flags, **settings)
    return cli
def main(argv=None):
    """Command-line entry point: parse *argv* and dispatch to run().

    argv -- full argument vector including the program name at index 0;
            defaults to sys.argv.

    Returns the exit status when option parsing terminates early (0 for
    --help, 2 for a usage error, as set by optparse), otherwise None after
    run() completes. The result is suitable for passing to sys.exit().
    """
    #Ideas borrowed from
    # http://www.artima.com/forums/flat.jsp?forum=106&thread=4829
    #But with better integration of entry points
    if argv is None:
        argv = sys.argv

    # By default, optparse usage errors are terminated by SystemExit
    try:
        optparser = command_line_prep()
        options, args = optparser.parse_args(argv[1:])
        # Both positional arguments are mandatory; error() raises SystemExit
        if len(args) < 1:
            optparser.error("Missing filename/URL to parse")
        if len(args) < 2:
            optparser.error("Missing main xpattern")
    except SystemExit as status:
        # Hand back the numeric code, not the exception object: passing the
        # exception itself on to sys.exit() would print it and exit with 1
        # even after a successful --help.
        return status.code

    # Perform additional setup work here before dispatching to run()
    # Detectable errors encountered here should be handled and a status
    # code of 1 should be returned.  Note, this would be the default code
    # for a SystemExit exception with a string message.
    source = args[0]
    # Command-line arguments arrive as byte strings; the XPath expressions
    # are decoded as UTF-8 before use.
    xpattern = args[1].decode('utf-8')
    if len(args) > 2:
        xpath = args[2].decode('utf-8')
    else:
        xpath = None

    sentinel = options.sentinel
    if sentinel:
        sentinel = sentinel.decode('utf-8')
    display = options.display
    if display:
        display = display.decode('utf-8')

    # A lone dash means "read the document from standard input"
    if source == '-':
        source = sys.stdin
    run(source, xpattern, xpath, options.limit, sentinel, display, options.ns)
#When executed as a script, run main() and exit with the status it reports
if __name__ == "__main__":
    exit_status = main(sys.argv)
    sys.exit(exit_status)
|