#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
import common
# Read the frequency lexicon and note every ambiguous word and its possible
# translations:
#   sl_tl[sl_word] = tl_word, with the '@'-marked defaults in sl_tl_defaults
# Then build a list of n-grams around each ambiguous source word, recording
# which target word each context selects and how often:
#   ngrams[sl_word][ngram][tl_word] = freq
# 5 Please<vblex><inf> rise<n> ,<cm> then<adv> ,<cm> for<pr> this<det><dem> minute<n> 's<gen> silence<n> .<sent>
# 5 Please<vblex><inf>/Complacer<vblex><inf> rise<n><sg>/aumento<n><m><sg> ,<cm>/,<cm> then<adv>/entonces<adv> ,<cm>/,<cm> for<pr>/para<pr>/durante<pr> this<det><dem><sg>/este<det><dem><GD><sg> minute<n><sg>/minuto<n><m><sg> '<apos>/'<apos> *s/*s silence<n><sg>/silencio<n><m><sg> .<sent>/.<sent>
# 5 Invitar<vblex> a<pr> todo<prn><tn> a<pr> que<cnjsub> prpers<prn><pro> poner<vblex> de<pr> pie<n> para<pr> guardar<vblex><inf> uno<det><ind> minuto<n> de<pr> silencio<n> .<sent>
# 5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10
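# For illustration only (hypothetical entries, assuming common.wrap() wraps
# tokens as ^...$), the structures end up looking like:
#   sl_tl_defaults['^minute<n>$'] = '^minuto<n>$'
#   ngrams['^minute<n>$']['^this<det><dem>$ ^minute<n>$']['^minuto<n>$'] = 5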
# -------------------------------------------------------------------------------
def ngram_count_patterns(freq_lexicon, candidates, crisphold, max_rules):
MAX_NGRAMS = 2
cur_line = 0
sl_tl_defaults = {}
sl_tl = {}
ngrams = {}
lineno = 0
    for line in open(freq_lexicon):
        lineno += 1
        if lineno % 10000 == 0:
            print(lineno, file=sys.stderr)
        if len(line.strip()) < 1:
            continue
row = common.tokenise_tagger_line(line)
sl = common.wrap(row[0])
tl = common.wrap(row[1])
if tl[1] == '*':
tl = tl[:-3] + '$'
if line.count('@') > 0:
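            # '@' marks the default translation in the frequency lexicon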
sl_tl_defaults[sl] = tl
else:
sl_tl[sl] = tl
cur_sl_row = []
cur_tl_row = []
cur_bt_row = []
cur_al_row = []
lineno = 0
    for line in open(candidates):
        lineno += 1
        line = line.strip()
        if lineno % 500 == 0:
            print(lineno, file=sys.stderr)
        if len(line) < 1:
            continue
        if line[0] == '-':
            # End of a record: harvest the n-gram contexts for every
            # ambiguous source word in the sentence just read.
            # ngrams[sl_word][ngram][tl_word] = freq
            for i, slword in enumerate(cur_sl_row):
if len(cur_bt_row[i]['tls']) > 1:
for al in cur_al_row:
if al == '':
continue
                        # Alignments are "tl_pos-sl_pos" pairs
                        al_sl = int(al.split('-')[1])
                        al_tl = int(al.split('-')[0])
if al_sl != i:
continue
tlword = common.wrap(cur_tl_row[al_tl])
slword = common.wrap(slword)
if slword not in sl_tl_defaults:
print('!', file=sys.stderr)
continue
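                        # Collect up to MAX_NGRAMS-1 words of context on each
                        # side of position i: pregram = left context + word,
                        # postgram = word + right context, roundgram = both.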
                        for j in range(1, MAX_NGRAMS):
                            # max(0, i-j) stops a negative start index from
                            # wrapping around to the end of the sentence
                            pregram = ' '.join(map(common.wrap, cur_sl_row[max(0, i-j):i+1]))
                            postgram = ' '.join(map(common.wrap, cur_sl_row[i:i+j+1]))
                            roundgram = ' '.join(map(common.wrap, cur_sl_row[max(0, i-j):i+j+1]))
                            if slword not in ngrams:
                                ngrams[slword] = {}
                            for gram in (pregram, postgram, roundgram):
                                if gram not in ngrams[slword]:
                                    ngrams[slword][gram] = {}
                                if tlword not in ngrams[slword][gram]:
                                    ngrams[slword][gram][tlword] = 0
                                ngrams[slword][gram][tlword] += 1
cur_line = 0
# print line
continue
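        # Each candidate record spans four lines (see the example at the top
        # of this file): the tagged SL sentence, the biltrans output, the
        # tagged TL sentence and the word alignments, each prefixed with a
        # frequency count and a tab.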
line = line.split('\t')[1]
if cur_line == 0:
cur_sl_row = common.tokenise_tagger_line(line)
elif cur_line == 1:
cur_bt_row = common.tokenise_biltrans_line(line)
elif cur_line == 2:
cur_tl_row = common.tokenise_tagger_line(line)
elif cur_line == 3:
cur_al_row = line.split(' ')
cur_line = cur_line + 1
for sl in ngrams:
for ngram in ngrams[sl]:
total = 0
max_freq = -1
current_tl = ''
            # Sort the candidate translations by descending frequency and
            # keep at most max_rules of them
            newtl = sorted(ngrams[sl][ngram],
                           key=lambda x: ngrams[sl][ngram][x], reverse=True)[:max_rules]
            for tl in newtl:
                if ngrams[sl][ngram][tl] > max_freq:
                    max_freq = ngrams[sl][ngram][tl]
                    current_tl = tl
                total += ngrams[sl][ngram][tl]
# > If for each of the rules we include
# > the amount of time the translation is seen with that pattern over the
# > total, we get a number we can try as a threshold. e.g. > 0.6 >0.7 >0.8
# > etc. (>0.6 would be the same as 2/3 of the time the alternative
# > translation is seen with that ngram, and 1/3 of the time the default
# > translation is). I think this would be easier to explain than the magic
# > number I came up with.
#
            # I see this as a way to define how "crispy" the decisions are. I think it
            # would be better to express this as a ratio: the ratio of the times the
            # alternative translation is seen to the number of times the default
            # translation is seen with that n-gram.
#
# It would be "2" in this case: the alternative is seen twice as often as
# the default.
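            # Worked example with hypothetical counts: if an n-gram selects
            # the alternative translation 10 times and the default 5 times
            # (total = 15), then alt_crisp = 10/15, def_crisp = 5/15 and
            # crispiness = alt_crisp / def_crisp = 2.0, so the rule passes a
            # threshold of, say, 1.5.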
for tl in newtl:
crispiness = 0.0
default = sl_tl_defaults[sl]
alt_crisp = float(ngrams[sl][ngram][tl]) / float(total)
def_crisp = 1.0
                if default in ngrams[sl][ngram]:
                    def_crisp = float(ngrams[sl][ngram][default]) / float(total)
weight = float(ngrams[sl][ngram][tl]) / float(total)
crispiness = alt_crisp/def_crisp
# print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram]
                if crispiness < crisphold:
                    print('-', crispiness, weight, total, max_freq,
                          ngrams[sl][ngram][tl],
                          '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][tl]))
                else:
                    print('+', crispiness, weight, total, max_freq,
                          ngrams[sl][ngram][tl],
                          '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][tl]))
if __name__ == '__main__':
    if len(sys.argv) < 5:
        print('Usage: count-patterns.py <lex> <candidates> <crispiness threshold> <max_rules>',
              file=sys.stderr)
        sys.exit(1)
    # The threshold and rule limit arrive as strings from the command line;
    # convert them so the numeric comparison and the list slice work.
    ngram_count_patterns(sys.argv[1], sys.argv[2], float(sys.argv[3]), int(sys.argv[4]))
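# Example invocation (hypothetical file names):
#   python3 count-patterns.py lex.freq candidates.txt 1.5 10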