File: saxhack.py

package info (click to toggle)
python-xml 0.8.4-10.1%2Blenny1
  • links: PTS
  • area: main
  • in suites: lenny
  • size: 4,972 kB
  • ctags: 10,628
  • sloc: python: 46,730; ansic: 14,354; xml: 968; makefile: 201; sh: 20
file content (166 lines) | stat: -rw-r--r-- 3,569 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#
#
# $Id: saxhack.py,v 1.5 2001/12/30 12:17:32 loewis Exp $
#
# illustrate how a saxlib parser can interface directly to sgmlop
#
# history:
# 98-05-23 fl   created (derived from the coreXML parser)
#
# Copyright (c) 1998 by Secret Labs AB
#
# info@pythonware.com
# http://www.pythonware.com
#

from xml.sax.saxlib import HandlerBase
class DocumentHandler:  # (HandlerBase)
    """Do-nothing document handler.

    Provides the three SAX-style callbacks that the parsers in this file
    hook into.  Each callback ignores its arguments and returns None, so
    a parse driven through this handler does no per-event work of its own.
    """

    # SAX interface

    def startElement(self, tag, attrs):
        """Receive an opening tag and its attributes; ignored."""
        # debugging aid: print "start", tag

    def endElement(self, tag):
        """Receive a closing tag; ignored."""
        # debugging aid: print "end", tag

    def characters(self, text, start, len):
        """Receive character data; ignored.

        The reported run is text[start:start + len].  The parameter is
        named ``len`` (hiding the builtin inside this method) because
        that name is part of the callback signature.
        """
        # debugging aid: print "data", text[start:start+len]

# --------------------------------------------------------------------
# sgmlop-based parser

from xml.parsers import sgmlop

class Parser:
    """Drives an sgmlop XML parser directly from a file object."""

    def setDocumentHandler(self, dh):
        """Create the sgmlop parser and register *dh* with it.

        Must be called before parseFile(), which uses self.parser.
        """
        self.parser = engine = sgmlop.XMLParser()
        # The second argument (1) is handed to sgmlop's register() as-is.
        # NOTE(review): its meaning is defined by sgmlop -- confirm there.
        engine.register(dh, 1)

    def parseFile(self, file):
        """Feed *file* to the parser in 16384-unit reads, then close the parser.

        Reading stops at the first empty read.  *file* itself is left
        open; only the parser is closed.
        """
        engine = self.parser

        chunk = file.read(16384)
        while chunk:
            engine.feed(chunk)
            chunk = file.read(16384)

        engine.close()

# --------------------------------------------------------------------
# xmllib-based parser

from xml.parsers import xmllib

class xmllibParser(xmllib.XMLParser):
    """xmllib.XMLParser subclass that reports to a SAX-style handler."""

    def setDocumentHandler(self, dh):
        """Install *dh*'s callbacks as attributes of this parser.

        After the call, self.characters, self.unknown_starttag and
        self.unknown_endtag are dh.characters, dh.startElement and
        dh.endElement respectively.
        """
        wiring = (
            ("characters", "characters"),
            ("unknown_starttag", "startElement"),
            ("unknown_endtag", "endElement"),
        )
        for own_name, handler_name in wiring:
            setattr(self, own_name, getattr(dh, handler_name))

    def handle_data(self, data):
        """Pass *data* to the handler's characters callback as one full run."""
        length = len(data)
        self.characters(data, 0, length)

    def parseFile(self, file):
        """Feed *file* to this parser in 16384-unit reads, then close the parser.

        Reading stops at the first empty read.  *file* itself is left open.
        """
        chunk = file.read(16384)
        while chunk:
            self.feed(chunk)
            chunk = file.read(16384)

        self.close()

# --------------------------------------------------------------------
# original xmllib-based parser

class slowParser(xmllib.SlowXMLParser):
    """xmllib.SlowXMLParser subclass that reports to a SAX-style handler."""

    def setDocumentHandler(self, dh):
        """Install *dh*'s callbacks as attributes of this parser.

        After the call, self.characters, self.unknown_starttag and
        self.unknown_endtag are dh.characters, dh.startElement and
        dh.endElement respectively.
        """
        self.characters = dh.characters
        self.unknown_starttag = dh.startElement
        self.unknown_endtag = dh.endElement

    def handle_data(self, data):
        """Pass *data* to the handler's characters callback as one full run."""
        self.characters(data, 0, len(data))

    def parseFile(self, file):
        """Feed *file* to this parser in 16384-unit reads, then close the parser.

        Reading stops at the first empty read.  *file* is left open for
        the caller to close, as in Parser and xmllibParser.
        """
        while 1:
            data = file.read(16384)
            if not data:
                break
            self.feed(data)

        # Close the parser, not the file: the end-of-input step belongs
        # to the parse (and to any timing of it), exactly as in
        # xmllibParser.parseFile.
        self.close()

# ====================================================================
# test stuff

import time, os, sys

def _time_parse(parser, filename):
    """Return the time.clock() interval *parser* spends on *filename*.

    A fresh do-nothing DocumentHandler is attached before the clock
    starts, so the interval covers parseFile() only (a dry run).  The
    file is closed even when the parse raises.
    """
    parser.setDocumentHandler(DocumentHandler())
    f = open(filename)
    try:
        started = time.clock()
        parser.parseFile(f)  # dry run
        return time.clock() - started
    finally:
        f.close()


def _require_measurable(elapsed):
    """Exit with status 1 when *elapsed* is zero and so cannot be a divisor."""
    if elapsed == 0:
        print('Measured time was too small; use a larger XML file')
        sys.exit(1)


# Every print below takes one parenthesized value, so each line reads the
# same whether print is a statement or a function.

if len(sys.argv) < 2:
    print('Usage: saxhack.py <xml filename>')
    sys.exit(1)

FILE = sys.argv[1]

size = os.stat(FILE)[6]  # index 6 of the stat tuple is the size in bytes

# sgmlop driven directly
t_direct = _time_parse(Parser(), FILE)
print(t_direct)
_require_measurable(t_direct)
print("sgmlop: %s bytes per second" % int(size / t_direct))

# xmllib.XMLParser
t_fast = _time_parse(xmllibParser(), FILE)
_require_measurable(t_fast)
print("xmllib: %s bytes per second" % int(size / t_fast))

# xmllib.SlowXMLParser
t_slow = _time_parse(slowParser(), FILE)
_require_measurable(t_slow)
print("slow xmllib: %s bytes per second" % int(size / t_slow))

# Timings relative to the slow parser: fraction of its time, then speed-up.
print("")
print("normalized timings:")
print("slow xmllib %s" % 1.0)
for label, elapsed in (("fast xmllib", t_fast), ("sgmlop     ", t_direct)):
    print("%s %s (%sx)" % (label,
                           round(elapsed / t_slow, 2),
                           round(t_slow / elapsed, 1)))
print("")