File: transformation.py

package info (click to toggle)
simpleparse 2.1.0a1-6
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd, wheezy
  • size: 2,776 kB
  • ctags: 4,332
  • sloc: python: 7,036; ansic: 6,395; makefile: 22
file content (103 lines) | stat: -rwxr-xr-x 3,382 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""A simple example of parsing

I have no idea for whom I originally created this code,
(which was originally written for SimpleParse 1.0) nor
why they wanted it.  Oh well, such is life.

Running as a script will do some timing tests, but the
tests are rather... simplistic.

The grammar is slow, parsing at around 5-10% of the speed I
normally expect from SimpleParse/mxTextTools parsers.
I'm guessing it gets into lots and lots of partial parses
of the "interesting" production, and that the huge number
of reported productions slows it down.  For example,
making atom non-reporting gives a 15% speedup on my
machine.
"""

# SimpleParse EBNF grammar for the prefix-notation expressions handled below.
#   set          one or more interesting/multset/plusset items
#   multset      '*' followed by two operands, each a set or an atom
#   plusset      '+' followed by two operands, each a set or an atom
#   atom         presumably a single character other than '+' or '*'
#                (SimpleParse negated character range) -- confirm
#   interesting  alternatives example8 .. example1, listed in that order;
#                only example1 has a matching Emitter.emitexample1 rewrite
# NOTE(review): the >name< spelling looks like SimpleParse's "expanded
# production" form (children reported in place of the production itself);
# confirm against the SimpleParse grammar documentation.
declaration = r'''
set       := (interesting/multset/plusset)+
multset   := '*',(set/atom), (set/atom)
plusset   := '+',(set/atom), (set/atom)
atom      := -[+*]

>interesting< := (example8/example7/example6/example5/example4/example3/example2/example1)
example1     := '*+',(set/atom),(set/atom),'+',(set/atom),(set/atom)
example2     := '**',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example3     := 'fsd*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example4     := 'm*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example5     := 'a*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example6     := 's*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example7     := 'bdf*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example8     := 'sd*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
'''
import sys, string
from simpleparse.parser import Parser
# Module-level parser shared by Emitter.parse.  'set' is passed as the
# second Parser argument -- presumably the root production; confirm.
parser = Parser(declaration,'set')


class Emitter:
	'''Re-emit parsed text, rewriting productions that have a handler.

	Result nodes are (tag, start, stop, children) tuples whose start/stop
	are offsets into the parsed string.  A node whose tag has a matching
	emit<tag> method on this object is rewritten by that method; every
	other node is copied from the source text with its children emitted
	recursively.
	'''
	def process( self, data ):
		'''Parse data and return the transformed text.'''
		tree = self.parse( data )
		# wrap up the tuple 'cause TextTools uses a different format for the
		# top-level: the child list is at index 1 and the end offset is last.
		tree = ('set',0, tree[-1], tree[1] )
		return self.emit( tree )
	def parse( self, data ):
		'''Remember data for later slicing and return the module parser's result.'''
		self.data = data
		return parser.parse( data)
	def write( self, data ):
		'''Write data to standard output.'''
		sys.stdout.write( data )
	def emit( self, tree ):
		'''Return the transformed text for a single result tuple.

		Dispatches to self.emit<tag>( tree ) when such an attribute exists.
		Otherwise returns the source text spanned by the node, with each
		child replaced by its own emitted text.  Requires self.data to be
		the string the offsets in tree refer to.
		'''
		handler = getattr( self, 'emit' + tree[0], None )
		if handler is not None: # have explicit processing function
			return handler( tree )
		children = tree[3]
		if not children:
			# we're just re-emitting same text...
			return self.data[ tree[1]:tree[2]]
		result = []
		position = tree[1]
		for child in children:
			# Copy the literal text lying before this child.  This covers the
			# leading text and also any text between two children, which
			# would otherwise be silently dropped from the output.
			result.append( self.data[ position:child[1] ] )
			result.append( self.emit( child ) )
			position = child[2]
		# trailing text after the last child
		result.append( self.data[ position:tree[2] ] )
		return ''.join( result )
	def emitexample1( self, tuple ):
		'''*+AB+CD -> ++*AC*AD+*BC*BD

		Expects exactly four children (A, B, C, D); each is emitted
		recursively before being substituted into the expanded form.
		'''
		a, b, c, d = [ self.emit( child ) for child in tuple[3] ]
		return '++*%s%s*%s%s+*%s%s*%s%s'%( a,c,a,d,b,c,b,d)

if __name__ == "__main__":
	# Crude timing run: parse each sample (no transformation is performed)
	# and report wall-clock time and characters parsed per second.

	testdata = [
	'''++m*++mkp+f*nkf''',
	'''*+ab+cd''',
	'''+ab+bc+de''',
	'''*ab*bc*de''',
	'''++m*++mkp+f*nkf'''*10000,
	]

	a = Emitter()
	import time
	for test in testdata:
		t = time.time()
		a.parse( test )
		t = time.time()-t
		# Single pre-formatted argument so the call prints the same text
		# whether print is a statement or a function.
		print( 'total time %s length %s' % (t, len(test)) )
		if t: # skip the rate when the elapsed time measured as zero
			print( '  %s cps' % (len(test)/t) )