1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
|
#!/usr/bin/python
# Convert output extracted from Ergane into TEI (.tei) format.
# The input should already be in UTF-8.
# You should replace '@' in the resulting TEI with spaces.
# Esperanto translations are preserved.
# THIS IS DOG SLOW because the Esperanto translations
# are converted from post-h notation into UTF-8 with konwert.
import sys, string, popen2, kjbuckets
def konwert(s, fr, to, par=""):
    """Convert *s* from one encoding to another with the external ``konwert`` tool.

    Runs the command ``konwert <fr>-<to><par>``, writes *s* to its standard
    input and returns everything the command writes to its standard output.

    s   -- the text to convert
    fr  -- source encoding name, passed to konwert as given
    to  -- target encoding name, passed to konwert as given
    par -- optional text appended verbatim after ``<fr>-<to>``

    When *fr* equals *to* no process is started and *s* is returned unchanged.
    """
    if fr == to:
        return s
    # NOTE(review): the command line is built by plain string formatting, so
    # fr/to/par must come from trusted code, never from the input file.
    pout, pin = popen2.popen2("konwert %s-%s%s" % (fr, to, par))
    try:
        try:
            pin.write(s)
        finally:
            # Closing the child's stdin marks end of input; doing it in a
            # finally block also releases the pipe if the write fails.
            pin.close()
        # The whole input is written before any output is read, which is
        # fine for the short strings this script converts.
        return pout.read()
    finally:
        pout.close()
# The Ergane export to convert is named by the first command-line argument.
# Without it, print a usage line instead of failing with a bare IndexError.
if len(sys.argv) < 2:
    sys.stderr.write("usage: %s ERGANE_OUTPUT_FILE\n" % sys.argv[0])
    sys.exit(2)
f = open(sys.argv[1], "r")
def teiheader():
    """Write the DOCTYPE declaration, the opening <tei.2> tag and the fixed
    <teiHeader> section to standard output, followed by one blank line."""
    preamble = """<!DOCTYPE TEI.2 PUBLIC "-//TEI P3//DTD Main Document Type//EN" [
<!ENTITY % TEI.dictionaries "INCLUDE" >
]>
<tei.2>
<teiHeader>
<filedesc>
<titlestmt>
<title> </title>
</titlestmt>
<publicationstmt>
<authority>Freedict.de</authority>
</publicationstmt>
<sourcedesc>
<p>http://www.freedict.de</p>
</sourcedesc>
</filedesc>
</teiHeader>
"""
    emit = sys.stdout.write
    emit(preamble)
    # The literal already ends with a newline; this second one produces the
    # blank line that separates the header from what follows.
    emit("\n")
# Local import: XML escaping is only needed by the entry writer below.
from xml.sax.saxutils import escape

words = {}  # not used below; kept so the module's globals stay unchanged
block = ""  # not used below; kept for the same reason

teiheader()
sys.stdout.write("<text>\n")
sys.stdout.write("<body>\n")

# Input layout, as parsed below: a headword line that does not start with a
# space, followed by its translation lines, each of which starts with a space.
lines = f.readlines()
f.close()
total = len(lines)

i = 0
while i < total:  # an empty input simply yields an empty <body>
    l = lines[i]
    if l[0] == ' ':
        # Only reachable for an indented first line: every later iteration
        # starts on the line that ended the previous entry.
        raise ValueError("line %d: expected a headword, found an indented line"
                         % (i + 1))
    header = l.strip()
    i = i + 1

    # Collect the translation lines that belong to this headword.
    trans = []
    while i < total and lines[i][0] == ' ':
        # The translation text starts after the first four characters.
        ll = lines[i][4:].strip()
        i = i + 1
        if not ll:
            continue  # blank continuation line: nothing to record
        a = ll.find('(')
        b = ll.rfind(')')
        if a > 0 and b > a:
            # "translation (esperanto)": split off the parenthesised gloss.
            esp = ll[a + 1:b]
            ll = (ll[:a] + ll[b + 1:]).strip()
            # Comment out the three conversion lines to leave the Esperanto
            # text unconverted; that is much faster because no konwert
            # processes are started. To drop the Esperanto text altogether,
            # append only ll below.
            esp = konwert(esp, 'hmetodo', 'UTF8')
            esp = esp.replace('@', 'ux')
            esp = konwert(esp, 'xmetodo', 'UTF8')
            trans.append("%s (%s)" % (ll, esp))
        else:
            # No well-formed "( ... )" gloss on this line: keep the
            # translation text by itself.
            trans.append(ll)

    # Passing the list through a kjSet removes duplicate translations; the
    # resulting order is whatever kjSet.items() returns.
    trans = kjbuckets.kjSet(trans).items()

    # Headword and translations are plain text, so escape &, < and > before
    # embedding them in the markup.
    out = sys.stdout.write
    out("<entry>\n")
    out(" <form>\n")
    out(" <orth>" + escape(header) + "</orth>\n")
    out(" </form>\n")
    out(" <trans>\n")
    for k in trans:
        out(" <tr>" + escape(k) + "</tr>\n")
    out(" </trans>\n")
    out("</entry>\n")

sys.stdout.write("</body>\n")
sys.stdout.write("</text>\n")
sys.stdout.write("</tei.2>\n")
|