1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
|
'''DOM builder from ESIS output.'''
# Hacked from ``pygrove.py'' from Paul Prescod.
import string, re, regsub, sys
from xml.dom.core import *
from xml.dom.builder import Builder
# Replacement text for the SDATA entity names this builder knows about.
# The accented letters are written as ISO 8859-1 (Latin-1) code points using
# escapes, so this source file stays pure ASCII whatever encoding it is
# stored or transferred in.
_sdata_dict = {
    'acirc': '\xe2',    # a with circumflex
    'agrave': '\xe0',   # a with grave
    'ccedil': '\xe7',   # c with cedilla
    'dollar': '$',
    'eacute': '\xe9',   # e with acute
    'ecirc': '\xea',    # e with circumflex
    'egrave': '\xe8',   # e with grave
    'icirc': '\xee',    # i with circumflex
    'ocirc': '\xf4',    # o with circumflex
    'percnt': '%',
    'ugrave': '\xf9',   # u with grave
}


def handle_sdata(sdata):
    """Return the replacement text for the SDATA entity named *sdata*.

    Names that are not in ``_sdata_dict`` yield the literal string
    ``'unknown'``.
    """
    return _sdata_dict.get(sdata, 'unknown')
class EsisBuilder(Builder):
    """Builder that constructs a DOM tree from line-oriented ESIS text.

    Every input line is one event whose kind is given by its first
    character:

    ``(``  start an element named by the rest of the line
    ``)``  end the current element
    ``A``  attribute for the next element to be started
    ``-``  character data
    ``C``  stop processing
    """

    def __init__(self):
        Builder.__init__(self)
        # Attributes collected from 'A' lines; they are handed to the element
        # created by the next '(' line and the store is then reset.
        self.attr_store = {}

    def feed(self, data):
        """Process *data*, a string of newline-separated ESIS lines.

        Processing stops at the first empty line (such as the tail left by
        a final newline) or at a 'C' line.  Lines with an unrecognised event
        character are reported on stderr and otherwise ignored.
        """
        for line in data.split('\n'):
            if not line:
                break
            event = line[0]
            text = line[1:]
            if event == '(':
                element = self.dom_factory.createElement(text,
                                                         self.attr_store)
                self.attr_store = {}
                self.push(element)
            elif event == ')':
                self.pop()
            elif event == 'A':
                # Expected layout: "<name> <type> <value>"; the value may
                # itself contain spaces, hence the split limit of 2.
                fields = text.split(' ', 2)
                if len(fields) < 3:
                    # No value field (e.g. "name IMPLIED"): there is nothing
                    # to store, and indexing the value would raise.
                    continue
                self.attr_store[fields[0]] = ESISDecode(fields[2])
            elif event == '-':
                self.push(self.dom_factory.createTextNode(ESISDecode(text)))
            elif event == 'C':
                return
            else:
                sys.stderr.write('Unknow event: ' + repr(line) + '\n')
# Regular expression that recognises one ESIS escape sequence.  The whole
# alternation is a single capturing group so that re.split() keeps the
# matched escapes in its result list alongside the plain text between them.
backslash = r"\\"
regor = "|"
find = "(" + regor.join([
    r"\\\\",                                     # escaped backslash
    r"\\n",                                      # backslash + 'n'
    r"\\\|\[[^" + backslash + r"]*\]\\\|",       # SDATA:  \|[name]\|
    r"\\[0-9]+",                                 # backslash + digits (octal)
    r"\\#[0-9]+",                                # backslash + '#' + digits
    r"\\%[0-9]+",                                # backslash + '%' + digits
]) + ")"
# No trailing "|" before the closing parenthesis: that would add an empty
# alternative which matches the empty string at every position.
# NOTE(review): the nsgmls documentation writes the last two forms with a
# terminating ';' (\#n; and \%n;), which this pattern does not consume --
# confirm whether the ';' should be swallowed here and in fix().
def fix(text):
    """Translate one piece of split ESIS text into its decoded form.

    *text* is either plain text, which is returned unchanged, or a single
    escape sequence starting with a backslash:

    ``\\\\``        -> one backslash
    ``\\n``         -> newline
    ``\\|[name]\\|``  -> ``handle_sdata(name)``
    ``\\#ddd``      -> ``chr(ddd)``, digits read as decimal
    ``\\%ddd``      -> ``chr(ddd)``, digits read as decimal
    ``\\ooo...``    -> ``chr`` of the first three digits read as octal,
                     followed by any remaining characters

    A piece that starts with a backslash but is not a valid escape (a lone
    backslash, an unknown escape letter, digits that are not valid in the
    expected base, or a code ``chr`` rejects) is returned unchanged rather
    than raising.
    """
    if not text or text[0] != "\\":
        return text

    body = text[1:]
    if body == "\\":
        return "\\"
    if body == "n":
        return "\n"

    marker = body[:1]   # '' for a lone backslash; never raises
    if marker == "|":
        # body looks like  |[name]\|  -- drop "|[" in front and "]\|" behind.
        return handle_sdata(body[2:-3].strip())

    try:
        if marker == "#" or marker == "%":
            return chr(int(body[1:], 10))
        return chr(int(body[:3], 8)) + body[3:]
    except ValueError:
        # Not a number in the expected base, or outside chr()'s range.
        return text
def ESISEncode(text):
    """Escape *text* so it fits on a single ESIS line.

    Each backslash becomes two backslashes and each newline becomes the two
    characters backslash + 'n'.

    Plain string replacement is used on purpose: re.sub() interprets
    backslash escapes in its replacement template, so a template of
    backslash + 'n' would insert a real newline and change nothing.
    """
    # Backslashes first, so those introduced for newlines are not doubled.
    return text.replace("\\", "\\\\").replace("\n", "\\n")
def ESISDecode(text):
    """Return *text* with its ESIS escape sequences decoded.

    The text is split on the module-level ``find`` pattern; because that
    pattern is one capturing group, the escapes themselves stay in the list
    of pieces.  Every piece is passed through ``fix`` and the results are
    concatenated.
    """
    # Only the string is passed to split: a second argument would be taken
    # as the maximum number of splits, which must be an integer.
    pieces = re.split(find, text)
    return "".join([fix(piece) for piece in pieces])
if __name__ == '__main__':
    # Command-line use: read the ESIS file named by the first argument,
    # build the DOM from it and print the document element as XML.
    from xml.dom.writer import XmlLineariser

    p = EsisBuilder()
    esis_file = open(sys.argv[1])
    try:
        p.feed(esis_file.read())
    finally:
        # Close the input even if reading or parsing fails.
        esis_file.close()
    w = XmlLineariser()
    w.add_newline_after = ['p', 'title', 'abstract']
    # Parenthesised single argument: same output with the print statement.
    print(w.linearise(p.document.documentElement))
# vim:ts=2:ai
|