File: tei2dict.py

package info (click to toggle)
serpento 0.3.6
  • links: PTS
  • area: main
  • in suites: woody
  • size: 292 kB
  • ctags: 381
  • sloc: python: 1,644; ansic: 666; perl: 157; sh: 116; makefile: 72
file content (85 lines) | stat: -rwxr-xr-x 1,964 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/python

# Convert filename.tei into filename.dict and filename.index.
# You can sort filename.index afterwards to achieve faster serpento startup.
# Usage: tei2dict.py filename.tei
# This creates filename.dict and filename.index next to filename.tei
# (the output paths are derived from the input path).

import sys, string, sgmllib, os

from utils import decb64

seporth = ", "  # use this to separate ORTH entries (if there are multiple ones)

septrans = "\n  " # use this to separate TR entries


teifile = sys.argv[1]

basename, ext = os.path.splitext(teifile)
assert ext=='.tei'
dictfile = basename+".dict"
dictfile = open(dictfile, "w")
indexfile = basename+".index"
indexfile = open(indexfile, "w")

def process_entry(orths, trs):
    """Write one dictionary record and index every headword that points at it.

    The record is appended to the module-level ``dictfile``: first a line
    with the items of ``orths`` joined by ``seporth``, then a line with the
    items of ``trs`` joined by ``septrans`` and prefixed with two spaces.
    For every item of ``orths`` one line of the form
    ``headword<TAB>offset<TAB>length`` is appended to ``indexfile``, where
    offset and length locate the record in ``dictfile`` and are each passed
    through ``decb64()`` (imported from ``utils``; its encoding is not
    visible from this file).

    orths -- sequence of headword strings
    trs   -- sequence of translation strings
    """
    start = dictfile.tell()
    # str.join is used instead of string.join(): the module-level function
    # has long been deprecated and no longer exists in Python 3, while the
    # method form produces exactly the same text.
    dictfile.write(seporth.join(orths) + "\n")
    dictfile.write("  " + septrans.join(trs) + "\n")
    length = dictfile.tell() - start
    for orth in orths:
        indexfile.write("%s\t%s\t%s\n" % (orth, decb64(start), decb64(length)))


class Parser(sgmllib.SGMLParser):
    """SGML parser that turns TEI <entry> elements into dictionary records.

    Character data seen inside <orth> is collected as headwords and data
    seen inside <tr> as translations.  When </entry> is reached both lists
    are handed to process_entry().
    """

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        # Non-zero while the parser is inside an <orth> / <tr> element.
        self.inorth = self.intr = 0
        # Start with empty lists so that end_entry() never hits a missing
        # attribute when an entry has no <form> or no <trans> child.
        self.header = []
        self.translations = []

    def start_entry(self, a):
        # Reset the per-entry state here: otherwise an entry that lacks
        # <form> or <trans> would silently reuse the headwords or
        # translations collected for the previous entry.
        self.header = []
        self.translations = []

    def end_entry(self):
        process_entry(self.header, self.translations)

    def start_form(self, a):
        # Each <form> starts a fresh headword list, so only the headwords of
        # the last <form> in an entry are written out.
        self.header = []

    def end_form(self):
        pass

    def start_orth(self, a):
        self.inorth = 1

    def end_orth(self):
        self.inorth = 0

    def start_tr(self, a):
        self.intr = 1

    def end_tr(self):
        self.intr = 0

    def start_trans(self, a):
        # Each <trans> starts a fresh translation list, so only the
        # translations of the last <trans> in an entry are written out.
        self.translations = []

    def end_trans(self):
        pass

    def handle_data(self, d):
        # Every chunk of character data becomes its own list item; <orth>
        # takes precedence if both flags happen to be set.
        if self.inorth:
            self.header.append(d)
        elif self.intr:
            self.translations.append(d)
        

p = Parser()

# Feed the input to the parser line by line.  try/finally guarantees that the
# input and both output files are closed even if parsing fails part-way.
f = open(teifile)
try:
    while 1:
        line = f.readline()
        if not line:
            break
        p.feed(line)
    # feed() may hold back input it cannot yet interpret; close() forces the
    # parser to process whatever is still buffered.  It must run before the
    # output files are closed because it can still trigger end_entry().
    p.close()
finally:
    f.close()
    dictfile.close()
    indexfile.close()