File: tei2dictf.py

package info (click to toggle)
serpento 0.3.6
  • links: PTS
  • area: main
  • in suites: woody
  • size: 292 kB
  • ctags: 381
  • sloc: python: 1,644; ansic: 666; perl: 157; sh: 116; makefile: 72
file content (112 lines) | stat: -rwxr-xr-x 2,711 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/python

# convert filename.tei into filename.dict and filename.index
# you can sort filename.pyindex afterwards to achieve faster serpento startup
# usage: tei2dict.py filename.tei
# this will create filename.dict and filename.index files alongside filename.tei

import sys, string, sgmllib, os

from utils import decb64

# Separator placed between ORTH (headword) strings when an entry has several.
seporth = ", "

# use this to separate TR entries
# NOTE(review): nothing in this script reads septrans at present; the name is
# kept so the module-level interface does not change.
septrans = "\n  "


# Check the command line explicitly: a missing argument would otherwise show
# up as a bare IndexError, and an `assert` on the extension disappears when
# Python is run with -O.
if len(sys.argv) < 2:
    sys.stderr.write("usage: %s filename.tei\n" % sys.argv[0])
    sys.exit(2)

teifile = sys.argv[1]

basename, ext = os.path.splitext(teifile)
if ext != '.tei':
    sys.stderr.write("%s: input file name must end in .tei: %s\n"
                     % (sys.argv[0], teifile))
    sys.exit(2)

# Both outputs share the input's path with only the extension replaced, so
# they are created in the same directory as the .tei file.
dictfile = open(basename + ".dict", "w")
indexfile = open(basename + ".index", "w")

def process_entry(orths, trs, width=80):
    """Write one entry to the dict file and index each of its headwords.

    The entry is appended to the module-level ``dictfile``: first a line with
    all headwords joined by ``seporth``, then one `` * `` bullet per
    translation.  Afterwards one line per headword is appended to the
    module-level ``indexfile`` in the form
    ``headword<TAB>decb64(start)<TAB>decb64(length)``, where *start* and
    *length* are taken from ``dictfile.tell()`` before and after writing.

    orths -- sequence of headword strings
    trs   -- sequence of translation strings
    width -- a bullet line is written out as soon as it has grown beyond this
             many characters; the check runs after a word has been added, so
             written lines are longer than ``width``, not shorter
    """
    start = dictfile.tell()
    dictfile.write(seporth.join(orths) + "\n")

    for tr in trs:
        text = tr.strip()
        # Undo the three XML escapes.  '&amp;' must come last so that an
        # escaped ampersand cannot combine into a second, bogus entity.
        text = text.replace('&lt;', '<')
        text = text.replace('&gt;', '>')
        text = text.replace('&amp;', '&')

        dictfile.write(" * ")
        # The first output line continues the " * " bullet; later lines are
        # indented by three spaces so they line up under the bullet text.
        indent = ''
        line = ''
        # Splitting on a single space (not on arbitrary whitespace) keeps
        # runs of spaces and any other whitespace inside the text as they are.
        for word in text.split(' '):
            line = line + word + ' '
            if len(line) > width:
                dictfile.write(indent + line + '\n')
                indent = '   '
                line = ''
        # Write whatever is left.  When the last word has just flushed the
        # line there is nothing left, and the line is already terminated;
        # writing another newline here would put a blank line in the entry.
        if line:
            dictfile.write(indent + line + '\n')

    length = dictfile.tell() - start
    for orth in orths:
        indexfile.write("%s\t%s\t%s\n" % (orth, decb64(start), decb64(length)))


class Parser(sgmllib.SGMLParser):
    """Collect headwords and translations from TEI <entry> elements.

    Text inside <orth> elements is gathered into ``self.header`` and text
    inside <tr> elements into ``self.translations``.  At every </entry> both
    lists are handed to ``process_entry``.
    """

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.inorth = self.intr = 0
        # Created here so that an entry without <form> or <trans> finds
        # empty lists instead of raising AttributeError.
        self.header = []
        self.translations = []
        # Text chunks of the <orth> / <tr> element currently being read.
        self._orth_chunks = []
        self._tr_chunks = []

    def start_entry(self, a):
        # Start every entry from a clean slate; otherwise an entry lacking
        # <form> or <trans> would be written with the previous entry's data.
        self.header = []
        self.translations = []

    def end_entry(self):
        process_entry(self.header, self.translations)

    def start_form(self, a):
        # Each <form> restarts the headword list.
        self.header = []

    def end_form(self):
        pass

    def start_orth(self, a):
        self.inorth = 1
        self._orth_chunks = []

    def end_orth(self):
        self.inorth = 0
        # An element that produced no text adds no headword.
        if self._orth_chunks:
            self.header.append(''.join(self._orth_chunks))
        self._orth_chunks = []

    def start_tr(self, a):
        self.intr = 1
        self._tr_chunks = []

    def end_tr(self):
        self.intr = 0
        # An element that produced no text adds no translation.
        if self._tr_chunks:
            self.translations.append(''.join(self._tr_chunks))
        self._tr_chunks = []

    def start_trans(self, a):
        # Each <trans> restarts the translation list.
        self.translations = []

    def end_trans(self):
        pass

    def handle_data(self, d):
        # The text of one element can arrive in several calls: sgmllib
        # reports entity and character references through handle_data on
        # their own, and text split across feed() calls is delivered piece
        # by piece.  Buffer the pieces and join them at the end tag so one
        # element yields exactly one headword or translation.  The pieces are
        # joined verbatim, without any whitespace normalisation.
        if self.inorth:
            self._orth_chunks.append(d)
        elif self.intr:
            self._tr_chunks.append(d)
        

# Feed the TEI file to the parser one line at a time; the parser's
# end-of-entry handler writes the .dict and .index output as it goes.
p = Parser()

f = open(teifile)
try:
    while 1:
        line = f.readline()
        if not line:
            break
        p.feed(line)
    # close() makes the parser process anything it is still holding back
    # (for example markup that was incomplete at the end of the last line).
    p.close()
finally:
    # Close all three files even when parsing fails, so the output written
    # so far is flushed to disk.
    f.close()
    dictfile.close()
    indexfile.close()