File: ding2tei.py

package info (click to toggle)
serpento 0.3.6
  • links: PTS
  • area: main
  • in suites: woody
  • size: 292 kB
  • ctags: 381
  • sloc: python: 1,644; ansic: 666; perl: 157; sh: 116; makefile: 72
file content (95 lines) | stat: -rwxr-xr-x 1,992 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/python

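# Convert a ding-like dictionary into .tei format.
# The dictionary consists of lines; each line is one entry and looks like this:
#   first :: second
# where first is a word in the first language and second is its translation.
# " :: " is the field separator; it can be changed below (sep).
# A line with exactly three fields (first :: second :: note) gets the third
# field prepended, in square brackets, to the text emitted as translation.
# A field can hold several phrases:
#   first :: second1, second2, second3
# separated by ", " (see sep1 below).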

# Field separator between the columns of one dictionary line.
sep = ' :: '
# Separator between the individual phrases inside one column;
# set this to some nonsense if you have one phrase per line.
sep1 = ', '

import string
import sys
from xml.sax.saxutils import escape

# The dictionary file is the script's only command-line argument.  Exit with
# a usage message instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s DICTIONARY_FILE" % sys.argv[0])

# Input handle; the conversion code further down reads it line by line.
f = open(sys.argv[1], "r")

# Legacy offset counter.  It is never advanced; it is kept only because
# later code in this script may still read it.
pos = 0


def teiheader():
    """Print the fixed document prolog and TEI header to standard output.

    The output consists of the TEI P3 DOCTYPE declaration (with the
    TEI.dictionaries parameter entity set to INCLUDE), the opening
    <tei.2> tag and a complete <teiHeader> element whose <title> is left
    blank.  The <tei.2> element is NOT closed here; the caller is
    expected to print the text body and the closing tag afterwards.

    Returns None.
    """
    # A parenthesised single-argument print is valid both as a Python 2
    # print statement and as a Python 3 function call.
    print("""<!DOCTYPE TEI.2       PUBLIC "-//TEI P3//DTD Main Document Type//EN"  [
   <!ENTITY % TEI.dictionaries "INCLUDE" >
]>

<tei.2>
  <teiHeader>
    <filedesc>
      <titlestmt>
        <title>           </title>
      </titlestmt>
      <publicationstmt>
        <authority>Freedict.de</authority>
      </publicationstmt>
      <sourcedesc>
        <p>http://www.freedict.de</p>
      </sourcedesc>
    </filedesc>
  </teiHeader>
""")

# Conversion toggles (edit to taste, like sep/sep1 above).
# swap_languages: when true, the second column of each input line becomes
# the headword and the first column becomes its translation.
swap_languages = True
# split_translations: when true, the translation column is split on sep1
# into several <tr> elements instead of being emitted as a single one.
split_translations = False

# Headword -> list of translations, merged over all input lines.
words = {}


def _parse_line(line):
    """Split one dictionary line into a (headword, translations) pair.

    Returns None when the line does not hold at least two fields
    separated by sep (e.g. blank lines), so the caller can skip it
    instead of crashing on the tuple unpacking.
    """
    fields = line.rstrip("\r\n").split(sep)
    if len(fields) < 2:
        return None
    one, two = fields[:2]
    if swap_languages:
        one, two = two, one
    # Exactly three fields: the third one is shown in square brackets in
    # front of the translation.  Lines with more fields get no prefix.
    if len(fields) == 3:
        two = "[%s] %s" % (fields[2], two)
    if split_translations:
        return one, two.split(sep1)
    return one, [two]


def _print_entry(headword, translations):
    """Print one <entry> element; all text content is XML-escaped."""
    print("<entry>")
    print("  <form>")
    # A headword column may list several spellings separated by sep1.
    for orth in headword.split(sep1):
        print("    <orth>" + escape(orth) + "</orth>")
    print("  </form>")
    print("  <trans>")
    for translation in translations:
        print("    <tr>" + escape(translation) + "</tr>")
    print("  </trans>")
    print("</entry>")


teiheader()
print("<text>")
print("<body>")

# Read the whole dictionary first so that repeated headwords are merged
# into a single entry; always release the input file afterwards.
try:
    lineno = 0
    for line in f:
        lineno += 1
        parsed = _parse_line(line)
        if parsed is None:
            # Blank lines are skipped silently; anything else is reported
            # on stderr so that dropped input does not go unnoticed.
            if line.strip():
                sys.stderr.write("line %d skipped: no %r separator\n"
                                 % (lineno, sep))
            continue
        headword, translations = parsed
        words.setdefault(headword, []).extend(translations)
finally:
    f.close()

for headword, translations in words.items():
    _print_entry(headword, translations)

print("</body>")
print("</text>")

print("</tei.2>")