File: ngram-count-patterns.py

Package: apertium-lex-tools 0.5.0-1

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys
import common

# Read the corpus, make a note of all ambiguous words, their frequency and their possible translations

# sl_tl_defaults[sl_word] = tl_word   (the '@'-marked default translation)
# sl_tl[sl_word]          = tl_word   (an alternative translation)

# Then, for every ambiguous source word, build the n-grams around it,
# recording which target word each n-gram selects and how often:

# ngrams[sl_word][ngram][tl_word] = freq

# A sample candidates record: the frequency field, then the SL, biltrans,
# TL and alignment lines:
# 5 	Please<vblex><inf> rise<n> ,<cm> then<adv> ,<cm> for<pr> this<det><dem> minute<n> 's<gen> silence<n> .<sent>
# 5 	Please<vblex><inf>/Complacer<vblex><inf> rise<n><sg>/aumento<n><m><sg> ,<cm>/,<cm> then<adv>/entonces<adv> ,<cm>/,<cm> for<pr>/para<pr>/durante<pr> this<det><dem><sg>/este<det><dem><GD><sg> minute<n><sg>/minuto<n><m><sg> '<apos>/'<apos> *s/*s silence<n><sg>/silencio<n><m><sg> .<sent>/.<sent>
# 5 	Invitar<vblex> a<pr> todo<prn><tn> a<pr> que<cnjsub> prpers<prn><pro> poner<vblex> de<pr> pie<n> para<pr> guardar<vblex><inf> uno<det><ind> minuto<n> de<pr> silencio<n> .<sent>
# 5 	0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10
# -------------------------------------------------------------------------------
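# A minimal sketch of the data shapes built below (illustrative values only;
# assumes common.wrap adds the ^...$ delimiters of the Apertium stream):
#
#   sl_tl_defaults['^minute<n>$'] = '^minuto<n><m><sg>$'   # '@'-marked default
#   ngrams['^minute<n>$']['^this<det><dem>$ ^minute<n>$']['^minuto<n><m><sg>$'] = 5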

def ngram_count_patterns(freq_lexicon, candidates, crisphold, max_rules):
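    """Count n-gram contexts around ambiguous source words and print one
    candidate lexical-selection rule per line, scored by 'crispiness'.

    freq_lexicon -- lexicon file; lines containing '@' mark default translations
    candidates   -- corpus of four-line records (SL, biltrans, TL, alignments)
    crisphold    -- crispiness threshold below which a rule is printed with '-'
    max_rules    -- maximum number of translations kept per n-gram
    """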
    # range(1, MAX_NGRAMS) below yields only j = 1: one word of context on
    # either side of the ambiguous word.
    MAX_NGRAMS = 2
    cur_line = 0

    sl_tl_defaults = {}
    sl_tl = {}
    ngrams = {}

    lineno = 0
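    # Each lexicon line is assumed to look like (hypothetical example):
    #   3 ^minute<n>$ ^minuto<n><m>$ @
    # i.e. a frequency, the source word, its translation, and an optional
    # trailing '@' on the default translation.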
    for line in open(freq_lexicon):
        lineno += 1
        if lineno % 10000 == 0:
            print(lineno, file=sys.stderr)
        # Skip blank lines (raw lines still carry their trailing newline).
        if not line.strip():
            continue

        row = common.tokenise_tagger_line(line)
        sl = common.wrap(row[0])
        tl = common.wrap(row[1])
        # Unanalysed words are marked with '*' after the opening delimiter.
        if tl[1] == '*':
            tl = tl[:-3] + '$'
        # A '@' on the line marks the default translation for this word.
        if '@' in line:
            sl_tl_defaults[sl] = tl
        else:
            sl_tl[sl] = tl

    cur_sl_row = []
    cur_tl_row = []
    cur_bt_row = []
    cur_al_row = []
    lineno = 0
    for line in open(candidates):
        lineno += 1
        line = line.strip()
        if lineno % 500 == 0:
            print(lineno, file=sys.stderr)
        if not line:
            continue
        # A run of '-' characters closes one four-line record.
        if line.startswith('-'):
            # End of record: walk the source row and count the n-gram
            # contexts seen with each ambiguous word's aligned translation.
            for i, slword in enumerate(cur_sl_row):
                # Only words with more than one bilingual translation are
                # ambiguous and need a lexical-selection rule.
                if len(cur_bt_row[i]['tls']) > 1:
                    for al in cur_al_row:
                        if al == '':
                            continue
                        # Alignments are 'tl-sl' index pairs: '4-2' links
                        # target token 4 to source token 2.
                        al_tl, al_sl = (int(x) for x in al.split('-'))
                        if al_sl != i:
                            continue

                        tlword = common.wrap(cur_tl_row[al_tl])
                        # Wrap into a fresh name so a source word aligned to
                        # several target words is not wrapped twice.
                        slword_w = common.wrap(slword)

                        if slword_w not in sl_tl_defaults:
                            print('!', file=sys.stderr)
                            continue

                        for j in range(1, MAX_NGRAMS):
                            # Clamp the left edge: a negative slice index
                            # would wrap around to the end of the row.
                            start = max(0, i - j)
                            pregram = ' '.join(map(common.wrap, cur_sl_row[start:i+1]))
                            postgram = ' '.join(map(common.wrap, cur_sl_row[i:i+j+1]))
                            roundgram = ' '.join(map(common.wrap, cur_sl_row[start:i+j+1]))

                            # Accumulate nested counts:
                            #   ngrams[sl][gram][tl] = frequency
                            counts = ngrams.setdefault(slword_w, {})
                            for gram in (pregram, postgram, roundgram):
                                tl_counts = counts.setdefault(gram, {})
                                tl_counts[tlword] = tl_counts.get(tlword, 0) + 1

            cur_line = 0
            continue

        # Data lines are '<freq>\t<content>'; keep only the content field.
        line = line.split('\t')[1]

        if cur_line == 0:
            cur_sl_row = common.tokenise_tagger_line(line)    # source sentence
        elif cur_line == 1:
            cur_bt_row = common.tokenise_biltrans_line(line)  # bilingual choices
        elif cur_line == 2:
            cur_tl_row = common.tokenise_tagger_line(line)    # target sentence
        elif cur_line == 3:
            cur_al_row = line.split(' ')                      # word alignments

        cur_line += 1

    for sl in ngrams:
        for ngram in ngrams[sl]:
            # Keep the max_rules most frequent translations of this n-gram.
            newtl = sorted(ngrams[sl][ngram],
                           key=lambda x: ngrams[sl][ngram][x],
                           reverse=True)[:max_rules]
            if not newtl:
                continue
            # newtl is sorted by descending count, so the first entry holds
            # the maximum frequency.
            max_freq = ngrams[sl][ngram][newtl[0]]
            total = sum(ngrams[sl][ngram][t] for t in newtl)

            # > If for each of the rules we include
            # > the amount of time the translation is seen with that pattern over the
            # > total, we get a number we can try as a threshold. e.g. > 0.6 >0.7 >0.8
            # > etc.  (>0.6 would be the same as 2/3 of the time the alternative
            # > translation is seen with that ngram, and 1/3 of the time the default
            # > translation is). I think this would be easier to explain than the magic
            # > number I came up with.
            #
            # I see this as a way to define how "crispy" the decisions are. I think it
            # would be better to express this as a ratio: the ratio of the times the
            # alternative translation is seen to the number of times the default
            # translation is seen with that n-gram.
            #
            # It would be "2" in this case: the alternative is seen twice as often as
            # the default.
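            # Worked example (hypothetical counts): an n-gram seen 6 times,
            # 4 with the alternative translation and 2 with the default,
            # gives alt_crisp = 4/6, def_crisp = 2/6 and
            # crispiness = alt_crisp / def_crisp = 2.0, so the rule is kept
            # ('+') whenever crisphold <= 2.0.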

            for tl in newtl:
                default = sl_tl_defaults[sl]
                alt_crisp = ngrams[sl][ngram][tl] / total
                # If the default translation never occurs with this n-gram,
                # fall back to a denominator of 1.0.
                def_crisp = 1.0
                if default in ngrams[sl][ngram]:
                    def_crisp = ngrams[sl][ngram][default] / total

                weight = alt_crisp
                crispiness = alt_crisp / def_crisp

                if crispiness < crisphold:
                    print('-', crispiness, weight, total, max_freq,
                          ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][tl]))
                else:
                    print('+', crispiness, weight, total, max_freq,
                          ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][tl]))


if __name__ == '__main__':
    if len(sys.argv) < 5:
        print('Usage: ngram-count-patterns.py <freq_lexicon> <candidates> <crisphold> <max_rules>',
              file=sys.stderr)
        sys.exit(1)

    # crisphold is a ratio and max_rules a count: convert them from the
    # command-line strings so the comparison and the slice above work.
    ngram_count_patterns(sys.argv[1], sys.argv[2],
                         float(sys.argv[3]), int(sys.argv[4]))
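
# Example invocation (hypothetical file names):
#   ./ngram-count-patterns.py en-es.freq candidates.txt 1.5 5 > ngrams.out
# Each output line covers one (source word, n-gram, translation) triple,
# prefixed with '+' if its crispiness clears the threshold and '-' otherwise.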