File: trimxml.py

package info (click to toggle)
amara 1.2a2-1.1
  • links: PTS
  • area: main
  • in suites: squeeze
  • size: 796 kB
  • ctags: 876
  • sloc: python: 8,650; xml: 1,450; makefile: 8; sh: 4
file content (225 lines) | stat: -rw-r--r-- 7,374 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#!/usr/bin/env python
"""
A command line tool for running reports on XML files.

trimxml allows you to rapidly extract details from large XML files
on the command line.

Run "trimxml --help" for details of the command line parameters, but
here are some pointers to get you started.

Let's say you have a simple database dump format with the following
form:

<db>
  <record id="1">
    <name>Alex</name>
    <address>123 Maple St.</address>
  </record>
  <record id="2">
    <name>Bob</name>
    <address>456 Birch Rd.</address>
  </record>
  <record id="3">
    <name>Chris</name>
    <address>789 Pine St.</address>
  </record>
</db>

You can:

Get all the full contents of name elements

$ trimxml file.xml name
<name>Alex</name>
<name>Bob</name>
<name>Chris</name>

Get the full contents of the record with ID 2

$ trimxml file.xml record "@id='2'"
<record id="2">
    <name>Bob</name>
    <address>456 Birch Rd.</address>
  </record>

Get the full contents of the first two name elements

$ trimxml -c 2 file.xml name
<name>Alex</name>
<name>Bob</name>

Get the name of the record with ID 2

$ trimxml -d "name" file.xml record "@id='2'"
<name>Bob</name>

You could display the id and each corresponding name as follows:

$ trimxml file.xml "@id|name"
1
<name>Alex</name>
2
<name>Bob</name>
3
<name>Chris</name>

Or a more precise approach might be (demonstrating the use of XPath functions):

$ trimxml -d "concat(@id, ': ', name)" file.xml record
1: Alex
2: Bob
3: Chris

trimxml uses namespaces declared on the document element, so you can
conveniently make queries without needing to separately declare prefixes.
So to get the URLs of all a links in an XHTML document you could do:

trimxml -d "@href" file.xhtml "ht:a"

This works as long as there is a namespace declaration
xmlns:ht="http://www.w3.org/1999/xhtml" in the document.  If not
(many XHTML documents use the default namespace, which, courtesy of XPath 1.0
restrictions, prevents trimxml from doing any guesswork for you) you have
to declare the prefix:

trimxml --ns=ht="http://www.w3.org/1999/xhtml" -d "@href" http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml "ht:a"

Notice how this example loads the source XML (XHTML) from a Web URL rather than a local file.  Of course, a shortcut for this is simply:

trimxml http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml "@href"

"""
#The following won't work because EXSLT is only supported in XsltContext and we use Ft.Xml.XPath.Context
#We can probably revisit when we make bindery nodes subclasses of Domlette
#trimxml --ns=str="http://exslt.org/strings" -d "str:replace(., 'http://', '')" http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml "@href"

import os
import re
import sys
import codecs
import optparse
#import cStringIO
import amara
from amara import saxtools
from xml.dom import Node

#from xml.dom import EMPTY_NAMESPACE as NULL_NAMESPACE
#from xml.dom import EMPTY_PREFIX as NULL_PREFIX


#FIXME: Use 4Suite L10N
def _(t):
    """Return t unchanged; stand-in until real localization is hooked up."""
    return t


def run(source, xpattern, xpath, limit, sentinel, display, prefixes):
    """
    Report on the nodes of an XML source that match a pattern, printing
    each report to standard output.

    source - the XML to process, in any form accepted by
             saxtools.sniff_namespace and amara.pushbind.  An object with
             a read method is rewound after namespace sniffing if it also
             has a rewind method; otherwise it is read fully first
    xpattern - pattern handed to amara.pushbind to select candidate nodes
    xpath - optional XPath expression evaluated on each candidate; when
            given, only candidates for which it is true are reported
    limit - stop once this many nodes have been reported; -1 means no limit
    sentinel - optional XPath expression evaluated on each candidate after
               any reporting; when it is true, processing stops
    display - optional XPath expression evaluated on each reported node;
              its result is printed in place of the node itself
    prefixes - optional sequence of "PREFIX=URI" strings; these take
               precedence over the prefixes sniffed from the source

    Raises ValueError if an entry of prefixes has no "=" separator.
    """
    #Note: print(x) with a single argument behaves exactly like the
    #Python 2 print statement, so this body also compiles on Python 3
    prefixes = prefixes or {}
    try:
        #Split on the first "=" only: a namespace URI may itself
        #contain "=" (e.g. in a query string)
        prefixes = dict([ p.split('=', 1) for p in prefixes ])
    except ValueError:
        raise ValueError("Invalid prefix declaration")
    if hasattr(source, 'read'):
        if hasattr(source, 'rewind'):
            nss = saxtools.sniff_namespace(source)
            source.rewind()
        else:
            #No way to rewind, so read everything up front; the same
            #content is then given to both the sniffer and pushbind
            source = source.read()
            nss = saxtools.sniff_namespace(source)
    else:
        nss = saxtools.sniff_namespace(source)
    #Explicit declarations win over sniffed ones
    nss.update(prefixes)
    count = 0
    for node in amara.pushbind(source, xpattern, prefixes=nss):
        if not xpath or node.xml_xpath(xpath):
            count += 1
            if display:
                #Print specified subset
                result = node.xml_xpath(display)
                if isinstance(result, list):
                    lines = []
                    for n in result:
                        #Explicit branch rather than "cond and a or b",
                        #which wrongly fell through to n.xml() for an
                        #attribute whose value is the empty string
                        if n.nodeType == Node.ATTRIBUTE_NODE:
                            lines.append(n.nodeValue)
                        else:
                            lines.append(n.xml())
                    print('\n'.join(lines))
                else:
                    print(result)
            else:
                #Print the whole thing
                try:
                    print(node.xml())
                except AttributeError:
                    #The matched object has no xml() method
                    print(unicode(node).encode('utf-8'))
            if limit != -1 and count >= limit:
                break
        if sentinel and node.xml_xpath(sentinel):
            break


class Usage(Exception):
    """Exception that carries a usage message in its msg attribute."""

    def __init__(self, msg):
        #Only the message is recorded; callers read it back from .msg
        self.msg = msg


def command_line_prep():
    """
    Build the option parser for the trimxml command line.

    Returns an optparse.OptionParser that understands -c/--limit (int,
    default -1), -d/--display, -n/--ns (may be repeated; the values are
    collected in a list) and --sentinel.
    """
    #Uses the module-level optparse import; no local re-import is needed
    parser = optparse.OptionParser(
        usage="%prog [options] source xpattern [xpath]")
    parser.add_option("-c", "--limit",
                      action="store", type="int", dest="limit", default=-1,
                      help="limit the number of xpattern matches retrieved; files will not be parsed beyond this number, so it serves as optimization", metavar="NUMBER")
    parser.add_option("-d", "--display",
                      action="store", type="string", dest="display",
                      help="xpath expression indicating what nodes to be displayed from matched and screened patterns", metavar="XPATH")
    parser.add_option("-n", "--ns",
                      action="append", type="string", dest="ns",
                      help="prefix to namespace mapping", metavar="<PREFIX=URI>")
    #Help text fixed: it used to contain a stray run of spaces
    parser.add_option("--sentinel",
                      action="store", type="string", dest="sentinel",
                      help="xpath expression to be checked for each pattern match.  If true it causes the reporting to stop, with no further parsing", metavar="XPATH")
    #parser.add_option("-q", "--quiet",
    #                  action="store_false", dest="verbose", default=1,
    #                  help="don't print status messages to stdout")
    return parser


def main(argv=None):
    """
    Command line entry point: parse the arguments and dispatch to run().

    argv - complete argument list, program name first; sys.argv is used
           when omitted

    Returns the exit code carried by the SystemExit that optparse raises
    (usage error, --help), or None once run() has completed.
    """
    #Ideas borrowed from
    # http://www.artima.com/forums/flat.jsp?forum=106&thread=4829
    #But with better integration of entry points
    if argv is None:
        argv = sys.argv
    optparser = command_line_prep()
    # By default, optparse usage errors are terminated by SystemExit
    try:
        options, args = optparser.parse_args(argv[1:])
        # Both positional arguments are mandatory
        if len(args) < 1:
            optparser.error("Missing filename/URL to parse")
        if len(args) < 2:
            optparser.error("Missing main xpattern")
    except SystemExit:
        # Hand back the exit code, not the exception object: the caller
        # passes our return value to sys.exit(), which would treat an
        # exception instance as a message to print and exit with status 1.
        # sys.exc_info() is used instead of an "except ... as" clause so
        # the minimum supported Python version is not raised.
        return sys.exc_info()[1].code
    source, xpattern = args[0], args[1]

    # Perform additional setup work here before dispatching to run()
    # Detectable errors encountered here should be handled and a status
    # code of 1 should be returned. Note, this would be the default code
    # for a SystemExit exception with a string message.
    if len(args) > 2:
        xpath = args[2].decode('utf-8')
    else:
        xpath = None
    xpattern = xpattern.decode('utf-8')
    sentinel = options.sentinel and options.sentinel.decode('utf-8')
    display = options.display and options.display.decode('utf-8')
    prefixes = options.ns
    limit = options.limit
    # A source of "-" means read the XML from standard input
    if source == '-':
        source = sys.stdin
    run(source, xpattern, xpath, limit, sentinel, display, prefixes)


if __name__ == "__main__":
    #Executed as a script: main()'s return value becomes the exit status
    exit_status = main(sys.argv)
    sys.exit(exit_status)