File: ergane2tei.py

package info (click to toggle)
serpento 0.3.6
  • links: PTS
  • area: main
  • in suites: woody
  • size: 292 kB
  • ctags: 381
  • sloc: python: 1,644; ansic: 666; perl: 157; sh: 116; makefile: 72
file content (106 lines) | stat: -rwxr-xr-x 2,350 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/python

# Convert output extracted from Ergane into .tei format.
# The output should already be in UTF-8.
# You should replace '@' in the resulting TEI with spaces.
# Esperanto translations are preserved.
# THIS IS DOG SLOW because the Esperanto translations
# are converted from post-h into UTF-8 with konwert.


import sys, string, popen2, kjbuckets

def konwert(s, fr, to, par=""):
    """Run ``s`` through the external ``konwert`` program and return its output.

    s   -- text fed to the child's standard input
    fr  -- source format name, first half of the ``fr-to`` filter spec
    to  -- target format name, second half of the ``fr-to`` filter spec
    par -- extra text appended verbatim to the command line (default: none)

    When ``fr`` and ``to`` are equal no process is started and ``s`` is
    returned unchanged.
    """
    if fr == to:
        # Identity conversion: skip spawning the child process altogether.
        return s

    command = "konwert %s-%s%s" % (fr, to, par)
    child_stdout, child_stdin = popen2.popen2(command)

    # Feed the whole input, then close stdin so the child sees end-of-file.
    child_stdin.write(s)
    child_stdin.close()

    converted = child_stdout.read()
    child_stdout.close()
    return converted


# The file to convert is named by the first command-line argument.
# Without one, print a usage line instead of dying with an IndexError.
if len(sys.argv) < 2:
    sys.stderr.write("usage: %s ERGANE_OUTPUT_FILE\n" % sys.argv[0])
    sys.exit(2)
f = open(sys.argv[1], "r")



def teiheader():
    """Print the fixed start of the TEI document to standard output.

    The text consists of the TEI P3 DOCTYPE declaration (with the
    TEI.dictionaries entity set to INCLUDE), the opening <tei.2> tag and a
    complete <teiHeader> that names Freedict.de as authority and leaves the
    title blank.  Nothing is returned.
    """
    prologue = """<!DOCTYPE TEI.2       PUBLIC "-//TEI P3//DTD Main Document Type//EN"  [
   <!ENTITY % TEI.dictionaries "INCLUDE" >
]>

<tei.2>
  <teiHeader>
    <filedesc>
      <titlestmt>
        <title>           </title>
      </titlestmt>
      <publicationstmt>
        <authority>Freedict.de</authority>
      </publicationstmt>
      <sourcedesc>
        <p>http://www.freedict.de</p>
      </sourcedesc>
    </filedesc>
  </teiHeader>
"""
    # The literal already ends in a newline; print adds one more, which
    # leaves a blank line after </teiHeader>.
    print(prologue)

# NOTE: neither of these two names is read anywhere in the visible script;
# they are kept only so the module-level names continue to exist.
words = {}

block = ""

teiheader()
print("<text>")
print("<body>")

# Read the whole input up front and release the file handle right away.
lines = f.readlines()
f.close()

i = 0

# Input layout, as consumed below: a headword line starts in column 0 and is
# followed by zero or more translation lines that start with a space (their
# first four characters are dropped).  A translation may carry a gloss in
# parentheses, which is run through konwert.
#
# Looping on the index instead of ``while 1`` makes an empty input file yield
# an empty <body> rather than an IndexError on lines[0].
while i < len(lines):
    l = lines[i]
    # Only the very first line can trip this: every later headword is a line
    # that ended the inner loop precisely because it did not start with ' '.
    assert l[0] != ' ', "input must start with a headword line"
    header = string.strip(l)
    trans = []
    while 1:
        i = i+1
        if i >= len(lines):
            break
        ll = lines[i]
        if ll[0] != ' ':
            # Next headword reached; the outer loop picks it up at index i.
            break
        ll = string.strip(ll[4:])
        a = string.find(ll, '(')
        b = string.rfind(ll, ')')
        # Split off the gloss only for a well-formed "(...)" that does not
        # open the line.  Previously a line without parentheses reused the
        # gloss of the preceding line (or raised NameError on the first
        # one), and an unmatched '(' produced garbage slices.
        if a > 0 and b > a:
            esp = ll[a+1:b]
            ll = string.strip(ll[:a]+ll[b+1:])
            ### comment out the three conversion lines below to skip the
            ### konwert round-trips (much faster); the gloss is then
            ### emitted unconverted
            esp = konwert(esp, 'hmetodo', 'UTF8')
            # NOTE(review): presumably '@' stands for a letter the h-method
            # pass cannot express and 'ux' is its x-method spelling - confirm.
            esp = string.replace(esp, '@', 'ux')
            esp = konwert(esp, 'xmetodo', 'UTF8')
            ###
            trans.append("%s (%s)" % (ll, esp))
        else:
            # No usable gloss: emit the translation text on its own.
            trans.append(ll)
    # Going through a kjSet collapses duplicate translations; the order of
    # the resulting list is whatever kjbuckets returns.
    trans = kjbuckets.kjSet(trans).items()

    print("<entry>")
    print("  <form>")
    print("    <orth>"+header+"</orth>")
    print("  </form>")
    print("  <trans>")
    for k in trans:
        print("    <tr>"+k+"</tr>")
    print("  </trans>")
    print("</entry>")

print("</body>")
print("</text>")

print("</tei.2>")