1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
|
#!/usr/bin/python
# convert filename.tei into filename.dict and filename.index
# you can sort filename.pyindex afterwards to achieve faster serpento startup
# usage: tei2dict.py filename.tei
# this will create filename.dict and filename.index files in current directory
import sys, string, sgmllib, os
from utils import decb64
seporth = ", " # use this to separate ORTH entries (if there are multiple ones)
septrans = "\n " # use this to separate TR entries

# Validate the command line explicitly.  A missing argument used to surface
# as a bare IndexError, and the extension check was an assert, which is
# skipped entirely when Python runs with -O.
if len(sys.argv) < 2:
    sys.exit("usage: tei2dict.py filename.tei")
teifile = sys.argv[1]
basename, ext = os.path.splitext(teifile)
if ext != '.tei':
    sys.exit("tei2dict.py: expected a .tei file, got %r" % (teifile,))

# splitext() keeps any directory part of the argument, so both output files
# are created alongside the path given on the command line.
dictfile = open(basename + ".dict", "w")
indexfile = open(basename + ".index", "w")
def process_entry(orths, trs):
    """Append one entry to the dict file and index it under each headword.

    orths -- sequence of headword strings.  They are joined with ``seporth``
             to form the first line of the entry, and every one of them gets
             its own line in the index file.
    trs   -- sequence of translation strings.  They are joined with
             ``septrans`` and written after a single leading space.

    Each index line is "<headword>\\t<offset>\\t<length>", where offset and
    length are the entry's position and size in the dict file as reported by
    ``dictfile.tell()``, each passed through ``decb64``.
    """
    offset = dictfile.tell()
    headline = seporth.join(orths) + "\n"
    body = " " + septrans.join(trs) + "\n"
    dictfile.write(headline)
    dictfile.write(body)
    size = dictfile.tell() - offset
    # NOTE(review): decb64 comes from the project's utils module; presumably
    # it encodes a decimal number in base64 -- confirm against utils.
    for word in orths:
        indexfile.write("%s\t%s\t%s\n" % (word, decb64(offset), decb64(size)))
class Parser(sgmllib.SGMLParser):
    """SGML parser that feeds TEI dictionary entries to process_entry().

    Text inside <orth> elements is collected into ``self.header`` and text
    inside <tr> elements into ``self.translations``.  When </entry> is seen,
    both lists are handed to process_entry().

    sgmllib may deliver the text of a single element in several
    handle_data() calls (for example around entity references, or when the
    text straddles two feed() calls).  The pieces are therefore buffered and
    joined when the element closes, so that one <orth> or <tr> always
    produces exactly one list item.
    """

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.inorth = self.intr = 0
        # Defined up front so that an <entry> lacking <form> or <trans>
        # cannot raise AttributeError in end_entry().
        self.header = []
        self.translations = []
        # Text fragments of the <orth> / <tr> element currently open.
        self._orth_chunks = []
        self._tr_chunks = []

    def start_entry(self, a):
        # Start every entry from a clean slate; otherwise an entry without
        # <form> or <trans> would silently reuse the previous entry's data.
        self.header = []
        self.translations = []

    def end_entry(self):
        process_entry(self.header, self.translations)

    def start_form(self, a):
        self.header = []

    def end_form(self):
        pass

    def start_orth(self, a):
        self.inorth = 1
        self._orth_chunks = []

    def end_orth(self):
        self.inorth = 0
        # An element with no text contributes nothing, as before.
        if self._orth_chunks:
            self.header.append("".join(self._orth_chunks))
            self._orth_chunks = []

    def start_tr(self, a):
        self.intr = 1
        self._tr_chunks = []

    def end_tr(self):
        self.intr = 0
        if self._tr_chunks:
            self.translations.append("".join(self._tr_chunks))
            self._tr_chunks = []

    def start_trans(self, a):
        self.translations = []

    def end_trans(self):
        pass

    def handle_data(self, d):
        # <orth> takes precedence over <tr> if both flags happen to be set.
        if self.inorth:
            self._orth_chunks.append(d)
        elif self.intr:
            self._tr_chunks.append(d)
# Drive the parser over the input file one line at a time.
p = Parser()
f = open(teifile)
try:
    for line in f:
        p.feed(line)
    # close() makes sgmllib process whatever it is still buffering, as if an
    # end-of-file mark followed; without it trailing data may be dropped.
    p.close()
finally:
    # Release the input and both output files even if parsing fails.
    f.close()
    dictfile.close()
    indexfile.close()
|