File: trimxml.py

package info (click to toggle)
amara 1.2a2-1.1
links: PTS
area: main
in suites: squeeze
size: 796 kB
ctags: 876
sloc: python: 8,650; xml: 1,450; makefile: 8; sh: 4
file content (225 lines) | stat: -rw-r--r-- 7,374 bytes
parent folder | download | duplicates (2)
#!/usr/bin/env python
"""
A command line tool for running reports on XML files.

trimxml allows you to rapidly extract details from large XML files
on the command line.

Run "trimxml --help" for details of the command line parameters, but
here are some pointers to get you started.

Let's say you have a simple database dump format with the following
form:

<db>
  <record id="1">
    <name>Alex</name>
    <address>123 Maple St.</address>
  </record>
  <record id="2">
    <name>Bob</name>
    <address>456 Birch Rd.</address>
  </record>
  <record id="3">
    <name>Chris</name>
    <address>789 Pine St.</address>
  </record>
</db>

You can:

Get the full contents of all name elements

$ trimxml file.xml name
<name>Alex</name>
<name>Bob</name>
<name>Chris</name>

Get the full contents of the record with ID 2

$ trimxml file.xml record "@id='2'"
<record id="2">
    <name>Bob</name>
    <address>456 Birch Rd.</address>
  </record>

Get the full contents of the first two name elements

$ trimxml -c 2 file.xml name
<name>Alex</name>
<name>Bob</name>

Get the name of the record with ID 2

$ trimxml -d "name" file.xml record "@id='2'"
<name>Bob</name>

You could display the id and each corresponding name as follows:

$ trimxml file.xml "@id|name"
1
<name>Alex</name>
2
<name>Bob</name>
3
<name>Chris</name>

Or a more precise approach might be (demonstrating the use of XPath functions):

$ trimxml -d "concat(@id, ': ', name)" file.xml record
1: Alex
2: Bob
3: Chris

trimxml uses namespaces declared on the document element, so you can
conveniently make queries without needing to separately declare prefixes.
So to get the URLs of all a links in an XHTML document you could do:

trimxml -d "@href" file.xhtml "ht:a"

This works as long as the document element carries the namespace declaration
xmlns:ht="http://www.w3.org/1999/xhtml".  If it does not (many XHTML
documents use the default namespace, and XPath 1.0 restrictions prevent
trimxml from guessing a prefix for you), you have to declare the prefix
yourself:

trimxml --ns=ht="http://www.w3.org/1999/xhtml" -d "@href" http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml "ht:a"

Notice how this example loads the source XML (XHTML) from a Web URL rather than a local file.  Of course, a shortcut for this is simply:

trimxml http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml "@href"

"""
#The following won't work because EXSLT is only supported in XsltContext and we use Ft.Xml.XPath.Context
#We can probably revisit when we make bindery nodes subclasses of Domlette
#trimxml --ns=str="http://exslt.org/strings" -d "str:replace(., 'http://', '')" http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml "@href"

import os
import re
import sys
import codecs
import optparse
#import cStringIO
import amara
from amara import saxtools
from xml.dom import Node

#from xml.dom import EMPTY_NAMESPACE as NULL_NAMESPACE
#from xml.dom import EMPTY_PREFIX as NULL_PREFIX


#FIXME: Use 4Suite L10N
#Stand-in for a message translation function: returns its argument unchanged.
#It is not called anywhere in the visible part of this module.
def _(t): return t


def _display_text(n):
    #Text shown for one node of a display result: attribute nodes are shown
    #by value (even when that value is the empty string), everything else
    #through its xml() serialization
    if n.nodeType == Node.ATTRIBUTE_NODE:
        return n.nodeValue
    return n.xml()


def run(source, xpattern, xpath, limit, sentinel, display, prefixes):
    """
    Print the nodes of source that match xpattern, one report per match.

    source   - the XML to process, handed to saxtools.sniff_namespace and
               amara.pushbind; main() passes a file name/URL string or
               sys.stdin.  An object with a read method but no rewind
               method is read into memory first
    xpattern - pattern given to amara.pushbind to select the nodes to visit
    xpath    - optional XPath expression; when given, only nodes for which
               it evaluates to a true value are reported
    limit    - maximum number of nodes to report; -1 means no limit and 0
               means report nothing (the source is then not parsed at all)
    sentinel - optional XPath expression evaluated on every visited node;
               processing stops as soon as it evaluates to a true value
    display  - optional XPath expression evaluated against each reported
               node; its result is printed instead of the whole node
    prefixes - optional sequence of "PREFIX=URI" strings; these mappings
               override the namespaces sniffed from the source

    Raises ValueError("Invalid prefix declaration") if an entry of
    prefixes contains no '='.
    """
    #Split on the first '=' only: a namespace URI may itself contain '='
    #(for instance in a query string)
    try:
        prefixes = dict([ p.split('=', 1) for p in (prefixes or ()) ])
    except ValueError:
        raise ValueError("Invalid prefix declaration")
    if limit == 0:
        #Nothing may be reported, so do not start parsing at all
        return
    if hasattr(source, 'read'):
        if hasattr(source, 'rewind'):
            #Sniff the namespaces, then put the stream back to its start
            #for the real parse
            nss = saxtools.sniff_namespace(source)
            source.rewind()
        else:
            #No way to rewind: read everything so the same text can be
            #used for sniffing and for parsing
            source = source.read()
            nss = saxtools.sniff_namespace(source)
    else:
        nss = saxtools.sniff_namespace(source)
    #Explicitly declared prefixes win over sniffed ones
    nss.update(prefixes)
    nodes = amara.pushbind(source, xpattern, prefixes=nss)
    count = 0
    for node in nodes:
        if not xpath or node.xml_xpath(xpath):
            count += 1
            if display:
                #Print specified subset
                result = node.xml_xpath(display)
                if isinstance(result, list):
                    print('\n'.join([ _display_text(n) for n in result ]))
                else:
                    print(result)
            else:
                #Print the whole thing
                try:
                    print(node.xml())
                except AttributeError:
                    #No xml() method on this node: fall back to its
                    #unicode form, written out as UTF-8
                    print(unicode(node).encode('utf-8'))
            if limit != -1 and count >= limit:
                break
        #The sentinel is checked on every visited node, reported or not
        if sentinel and node.xml_xpath(sentinel):
            break
    return


class Usage(Exception):
    """
    Exception that carries a message in its msg attribute.

    Not raised anywhere in the visible part of this module; presumably
    intended for reporting command line usage problems.
    """
    def __init__(self, msg):
        self.msg = msg

def command_line_prep():
    """
    Build the option parser for the command line.

    Options defined: -c/--limit (int, default -1), -d/--display,
    -n/--ns (repeatable; values are collected in a list) and --sentinel.

    Returns the configured optparse.OptionParser; no arguments are parsed
    here.
    """
    #One entry per option, in the order they appear in the help output:
    #(option strings, keyword settings for add_option)
    option_specs = [
        (("-c", "--limit"),
         {"action": "store", "type": "int", "dest": "limit", "default": -1,
          "metavar": "NUMBER",
          "help": "limit the number of xpattern matches retrieved; files will not be parsed beyond this number, so it serves as optimization"}),
        (("-d", "--display"),
         {"action": "store", "type": "string", "dest": "display",
          "metavar": "XPATH",
          "help": "xpath expression indicating what nodes to be displayed from matched and screened patterns"}),
        (("-n", "--ns"),
         {"action": "append", "type": "string", "dest": "ns",
          "metavar": "<PREFIX=URI>",
          "help": "prefix to namespace mapping"}),
        (("--sentinel",),
         {"action": "store", "type": "string", "dest": "sentinel",
          "metavar": "XPATH",
          "help": "xpath expression to be checked for each pattern match.  If true it causes the   reporting to stop, with no further parsing"}),
    ]
    #A quiet switch was sketched but never enabled:
    #  "-q", "--quiet", action="store_false", dest="verbose", default=1,
    #  help="don't print status messages to stdout"
    parser = optparse.OptionParser(
        usage="%prog [options] source xpattern [xpath]")
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)
    return parser


def main(argv=None):
    """
    Command line entry point: parse argv and dispatch to run().

    argv - full argument list with the program name at index 0;
           defaults to sys.argv

    Returns the exit status chosen by optparse when it ends option
    processing (for example after --help, or after a usage error such as
    a missing source or xpattern argument).  Otherwise returns None once
    run() has finished.
    """
    #Ideas borrowed from
    # http://www.artima.com/forums/flat.jsp?forum=106&thread=4829
    #But with better integration of entry points
    if argv is None:
        argv = sys.argv
    # By default, optparse usage errors are terminated by SystemExit
    try:
        optparser = command_line_prep()
        options, args = optparser.parse_args(argv[1:])
        # Mandatory arguments; optparser.error() raises SystemExit
        if len(args) < 1:
            optparser.error("Missing filename/URL to parse")
        if len(args) < 2:
            optparser.error("Missing main xpattern")
    except SystemExit:
        # Hand back the exit code itself, not the exception object: the
        # caller passes this value to sys.exit(), which would otherwise
        # print the object as an error message and exit with status 1,
        # even after a successful --help
        return sys.exc_info()[1].code
    source = args[0]
    xpattern = args[1]

    # Perform additional setup work here before dispatching to run()
    # Detectable errors encountered here should be handled and a status
    # code of 1 should be returned. Note, this would be the default code
    # for a SystemExit exception with a string message.
    xpath = None
    if len(args) > 2:
        # The screening xpath is optional
        xpath = args[2].decode('utf-8')
    xpattern = xpattern.decode('utf-8')
    sentinel = options.sentinel and options.sentinel.decode('utf-8')
    display = options.display and options.display.decode('utf-8')
    prefixes = options.ns
    limit = options.limit
    if source == '-':
        # "-" as the source means standard input
        source = sys.stdin
    run(source, xpattern, xpath, limit, sentinel, display, prefixes)


#When executed as a script, hand main()'s return value to sys.exit()
if __name__ == "__main__":
    sys.exit(main(sys.argv))