File: wordalign.py

Package: python-pynlpl 1.2.9-2
from pynlpl.statistics import FrequencyList


class WordAlignment(object):
    """Simple frequency-based word aligner: learns token co-occurrence counts
    from a sentence-aligned parallel corpus and aligns each token to the most
    strongly co-occurring token on the other side."""

    def __init__(self, casesensitive=False):
        self.casesensitive = casesensitive

    def train(self, sourcefile, targetfile):
        """Collect co-occurrence statistics from two sentence-aligned files."""
        sourcefile = open(sourcefile)
        targetfile = open(targetfile)

        #overall token frequencies per language
        self.sourcefreqlist = FrequencyList(None, self.casesensitive)
        self.targetfreqlist = FrequencyList(None, self.casesensitive)

        #co-occurrence frequency lists: for each token, the frequencies of the
        #tokens it co-occurs with on the other side
        self.source2target = {}
        self.target2source = {}

        for sourceline, targetline in zip(sourcefile, targetfile):
            sourcetokens = sourceline.split()
            targettokens = targetline.split()

            self.sourcefreqlist.append(sourcetokens)
            self.targetfreqlist.append(targettokens)

            for sourcetoken in sourcetokens:
                if sourcetoken not in self.source2target:
                    self.source2target[sourcetoken] = FrequencyList(targettokens, self.casesensitive)
                else:
                    self.source2target[sourcetoken].append(targettokens)

            for targettoken in targettokens:
                if targettoken not in self.target2source:
                    self.target2source[targettoken] = FrequencyList(sourcetokens, self.casesensitive)
                else:
                    self.target2source[targettoken].append(sourcetokens)

        sourcefile.close()
        targetfile.close()

    def test(self, sourcefile, targetfile):
        """Align each sentence pair from two sentence-aligned files, yielding
        (sourcetokens, targettokens, S2Talignment, T2Salignment) tuples."""
        sourcefile = open(sourcefile)
        targetfile = open(targetfile)

        #alignment stage: for each token, pick the token on the other side with
        #the highest co-occurrence count relative to its overall frequency
        for sourceline, targetline in zip(sourcefile, targetfile):
            sourcetokens = sourceline.split()
            targettokens = targetline.split()

            S2Talignment = []
            T2Salignment = []

            for sourcetoken in sourcetokens:
                #which of the target tokens co-occurs most strongly with this source token?
                besttoken = None
                bestscore = -1
                if sourcetoken in self.source2target: #skip source tokens unseen in training
                    for i, targettoken in enumerate(targettokens):
                        if targettoken in self.source2target[sourcetoken]:
                            score = self.source2target[sourcetoken][targettoken] / float(self.targetfreqlist[targettoken])
                            if score > bestscore:
                                bestscore = score
                                besttoken = i
                S2Talignment.append(besttoken) #TODO: multi-alignment?

            for targettoken in targettokens:
                #which of the source tokens co-occurs most strongly with this target token?
                besttoken = None
                bestscore = -1
                if targettoken in self.target2source: #skip target tokens unseen in training
                    for i, sourcetoken in enumerate(sourcetokens):
                        if sourcetoken in self.target2source[targettoken]:
                            score = self.target2source[targettoken][sourcetoken] / float(self.sourcefreqlist[sourcetoken])
                            if score > bestscore:
                                bestscore = score
                                besttoken = i
                T2Salignment.append(besttoken) #TODO: multi-alignment?

            yield sourcetokens, targettokens, S2Talignment, T2Salignment

        sourcefile.close()
        targetfile.close()
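
A minimal usage sketch (not part of the original file). It assumes hypothetical sentence-aligned plain-text corpora "corpus.en"/"corpus.nl" for training and "test.en"/"test.nl" for alignment, and that this module is importable as wordalign:

# Hypothetical usage sketch; file names and the import path are assumptions.
from wordalign import WordAlignment

aligner = WordAlignment(casesensitive=False)
aligner.train("corpus.en", "corpus.nl")   #build co-occurrence statistics

for sourcetokens, targettokens, s2t, t2s in aligner.test("test.en", "test.nl"):
    for i, sourcetoken in enumerate(sourcetokens):
        j = s2t[i]   #index into targettokens, or None if no co-occurrence was seen
        if j is not None:
            print(sourcetoken, "->", targettokens[j])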