1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
|
import sys;
import common;
def wrap(x):
    """Return the string *x* enclosed between a leading '^' and a trailing '$'."""
    return ''.join(('^', x, '$'))
# Lookup tables filled from the file named by sys.argv[1].  Keys and values
# are the '^...$'-wrapped forms produced by wrap().
sl_tl_defaults = {}  # source word -> translation from the last line containing '@'
sl_tl = {}           # source word -> list of translations, in file order
indexes = {}         # (source word, translation) -> index of that translation
trad_counter = {}    # source word -> number of translations seen so far
rindex = {}          # (source word, index) -> translation

with open(sys.argv[1]) as d:
    for line in d:
        # Lines yielded by iterating a file always contain at least their
        # newline, so a length test never fires; check the stripped text to
        # really skip blank lines instead of feeding them to the tokeniser.
        if not line.strip():
            continue

        row = common.tokenise_tagger_line(line)
        sl = wrap(row[0].strip())
        tl = wrap(row[1].strip())
        if tl[1] == '*':
            # Translation starts with '*': replace its last three characters
            # (the closing '$' and the two characters before it) with '$'.
            tl = tl[:-3] + '$'

        if '@' in line:
            sl_tl_defaults[sl] = tl

        # Translations of a source word are numbered 0, 1, 2, ... in the
        # order they appear in the file.
        idx = trad_counter.setdefault(sl, 0)
        sl_tl.setdefault(sl, []).append(tl)
        indexes[(sl, tl)] = idx
        rindex[(sl, idx)] = tl
        trad_counter[sl] = idx + 1

# Dump the reverse index to stderr: source word, index, translation.
for (src, pos), trg in rindex.items():
    print(src, pos, trg, file=sys.stderr)
# Each line of the file named by sys.argv[2] is split on ' \t ' into:
# source word, weight, translation index, context.  Sample lines:
#ability<n> 0.25652 1 ability<n> to<pr>
#ability<n> 1.54548 0 ability<n> to<pr> deliver<vblex><inf>
#ability<n> 1.48162 0 our<det><pos> ability<n> to<pr>
with open(sys.argv[2]) as d:
    for line in d:
        row = line.split(' \t ')
        slword = row[0].strip()
        # Converted only so that a non-numeric weight raises here; the output
        # below echoes the raw text of the field.
        weight = float(row[1])
        tlid = int(row[2])

        # Resolve the translation index back to the translation itself;
        # report and skip entries the reverse index does not know about.
        try:
            tlword = rindex[(slword, tlid)]
        except KeyError:
            print ('(', slword, ',', tlid, ') not in index', file=sys.stderr)
            continue

        context = row[3].strip()
        # Output is tab-separated: '+ ' followed by the weight, then the
        # source word, the context, the translation and a constant 1.
        print('+ ' + row[1], slword, context, tlword, '1', sep='\t')
|