File: esis_builder.py

package info (click to toggle)
python-xml 0.4.19981014-1
  • links: PTS
  • area: main
  • in suites: slink
  • size: 2,124 kB
  • ctags: 3,099
  • sloc: ansic: 9,075; python: 8,150; xml: 7,940; makefile: 84; sh: 41
file content (123 lines) | stat: -rw-r--r-- 2,462 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
'''DOM builder from ESIS output.'''

# Hacked from ``pygrove.py'' from Paul Prescod.

import string, re, regsub, sys
from xml.dom.core import *
from xml.dom.builder import Builder


# Replacement text for the SDATA entities this module knows about, keyed by
# entity name.  The accented letters are spelled as explicit Latin-1 escapes
# so that the table does not depend on the encoding the file is stored in.
# NOTE(review): the accented values were restored from the ISO entity names
# after having been reduced to empty strings -- confirm against upstream.
_sdata_dict = {
	'acirc': '\xe2',	# a with circumflex
	'agrave': '\xe0',	# a with grave accent
	'ccedil': '\xe7',	# c with cedilla
	'dollar': '$',
	'eacute': '\xe9',	# e with acute accent
	'ecirc': '\xea',	# e with circumflex
	'egrave': '\xe8',	# e with grave accent
	'icirc': '\xee',	# i with circumflex
	'ocirc': '\xf4',	# o with circumflex
	'percnt': '%',
	'ugrave': '\xf9',	# u with grave accent
}

def handle_sdata(sdata):
	'''Return the replacement text for the SDATA entity named ``sdata``.

	Names that have no entry in ``_sdata_dict`` yield the string 'unknown'.
	'''
	try:
		replacement = _sdata_dict[sdata]
	except KeyError:
		replacement = 'unknown'
	return replacement


class EsisBuilder(Builder):
	'''Build a DOM tree from ESIS text, one event per line.

	Each line starts with a one-character event code, handled by feed():
	'(' starts an element, ')' ends one, 'A' declares an attribute of the
	next element to start, '-' is character data and 'C' ends processing.
	'''

	def __init__(self):
		Builder.__init__(self)
		# Attributes seen since the last element start; they are handed to
		# the next element created by a '(' event.
		self.attr_store = {}
		#self.sdata_handler = handle_sdata

	def feed(self, data):
		'''Process the ESIS events contained in the string ``data``.

		Processing stops at a 'C' event.  Lines with an unrecognised event
		code are reported on stderr and otherwise ignored.
		'''
		for line in data.split('\n'):
			if not line:
				# Usually the empty tail after the final newline.  Skip it
				# instead of stopping, so a stray blank line cannot silently
				# discard the rest of the input.
				continue
			event = line[0]
			text = line[1:]

			if event == '(':
				element = self.dom_factory.createElement(text, self.attr_store)
				self.attr_store = {}
				self.push(element)

			elif event == ')':
				self.pop()

			elif event == 'A':
				# "name type value"; the value may itself contain spaces,
				# hence at most two splits.
				fields = text.split(' ', 2)
				if len(fields) < 3:
					# No value field (e.g. nsgmls's "Aname IMPLIED"):
					# there is nothing to store for this attribute.
					continue
				self.attr_store[fields[0]] = ESISDecode(fields[2])

			elif event == '-':
				node = self.dom_factory.createTextNode(ESISDecode(text))
				self.push(node)

			elif event == 'C':
				return

			else:
				sys.stderr.write('Unknow event: ' + repr(line) + '\n')


# Building blocks for the regular expression that locates ESIS escape
# sequences inside attribute values and character data.
backslash = r"\\"
regor = "|"

# A single capturing group, so that re.split() returns the escapes along
# with the literal text between them.  Alternatives, in order:
#   - two backslashes (an escaped backslash)
#   - a backslash followed by the letter n
#   - bracketed text between backslash-bar markers, e.g. \|[name]\|
#   - a backslash followed by digits
#   - a backslash, '#', digits
#   - a backslash, '%', digits
# No alternative may be empty: an empty one lets the pattern match the
# empty string, and re.split() on Python 3.7+ then splits between every
# single character.
# NOTE(review): nsgmls documents the last two forms with a closing ';'
# which this pattern does not consume -- confirm whether it should.
find = ("("
		+ r"\\\\" + regor
		+ r"\\n" + regor
		+ r"\\\|\[[^" + backslash + "]*" + r"\]\\\|" + regor
		+ r"\\[0-9]+" + regor
		+ r"\\#[0-9]+" + regor
		+ r"\\%[0-9]+"
		+ ")")

def fix(text):
	'''Decode one piece of ESIS text.

	``text`` is returned unchanged unless it starts with a backslash.
	Recognised escapes (after the leading backslash):

	  a second backslash   -> a single backslash
	  n                    -> a newline
	  |[name]\\|            -> handle_sdata(name)
	  #digits or %digits   -> the character with that decimal code
	  up to three digits   -> the character with that octal code; anything
	                          following the three digits is kept as is

	A lone backslash, or a backslash followed by anything else, is not an
	escape and is returned unchanged instead of raising.
	'''
	if not text or text[0] != "\\":
		return text
	body = text[1:]
	if not body:
		# A lone backslash: nothing to decode.
		return text
	if body == "\\":
		return "\\"
	if body == "n":
		return "\n"
	marker = body[0]
	if marker == "|":
		# Drop the leading "|[" and the trailing "]\|" around the name.
		return handle_sdata(body[2:-3].strip())
		#return '&' + string.strip(text[2:-3]) + ';'
	if marker == "#" or marker == "%":
		return chr(int(body[1:], 10))
	try:
		code = int(body[0:3], 8)
	except ValueError:
		# Not an octal escape after all; keep the text as it came in.
		return text
	return chr(code) + body[3:]

def ESISEncode(text):
	'''Return ``text`` with backslashes and newlines written as ESIS escapes.

	A backslash becomes two backslashes and a newline becomes a backslash
	followed by the letter n, which are the forms ESISDecode() turns back
	into the original characters.  Plain string replacement is used on
	purpose: re.sub() interprets a backslash-n replacement as a newline,
	which would leave the text unchanged.
	'''
	# Backslashes first, so the ones introduced for newlines are not doubled.
	escaped = text.replace("\\", "\\\\")
	return escaped.replace("\n", "\\n")

def ESISDecode(text):
	'''Return ``text`` with its ESIS escape sequences decoded.

	The text is split on the ``find`` pattern.  Because that pattern is one
	capturing group, the resulting list alternates between literal text
	(even positions) and matched escapes (odd positions); only the escapes
	are passed through fix(), so literal text is never reinterpreted.
	'''
	# Note: the second positional argument of split() is maxsplit, so the
	# pattern must not be passed again there.
	parts = re.split(find, text)
	for index in range(1, len(parts), 2):
		parts[index] = fix(parts[index])
	return "".join(parts)


if __name__ == '__main__':
	# Command-line use: read the ESIS file named by the first argument,
	# build the DOM and print it linearised as XML.
	import sys
	from xml.dom.writer import XmlLineariser

	if len(sys.argv) < 2:
		sys.stderr.write('usage: %s esis-file\n' % sys.argv[0])
		sys.exit(1)

	# Read the whole input up front and make sure the file gets closed.
	infile = open(sys.argv[1])
	try:
		data = infile.read()
	finally:
		infile.close()

	p = EsisBuilder()
	p.feed(data)

	w = XmlLineariser()
	w.add_newline_after = [ 'p', 'title', 'abstract' ]
	# Parenthesised single-argument form works as a statement and a call.
	print(w.linearise(p.document.documentElement))

# vim:ts=2:ai