#!/usr/bin/env python3
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import collections
import io

import numpy as np


def load_vectors(fname, maxload=200000, norm=True, center=False, verbose=True):
    """Load up to `maxload` word vectors from a text file in the fastText
    .vec format and return (words, matrix). Optionally length-normalize
    and/or mean-center the vectors."""
    if verbose:
        print("Loading vectors from %s" % fname)
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    if maxload > 0:
        n = min(n, maxload)
    x = np.zeros([n, d])
    words = []
    for i, line in enumerate(fin):
        if i >= n:
            break
        tokens = line.rstrip().split(' ')
        words.append(tokens[0])
        x[i, :] = np.array(tokens[1:], dtype=float)
    fin.close()
    if norm:
        # Length-normalize each row; the epsilon guards against zero vectors.
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if center:
        # Mean-center, then re-normalize so rows stay on the unit sphere.
        x -= x.mean(axis=0)[np.newaxis, :]
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if verbose:
        print("%d word vectors loaded" % len(words))
    return words, x
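

# File format note: load_vectors/save_vectors use the fastText .vec text
# convention: a header line "n d", then one "word v_1 ... v_d" line per
# vector. Illustrative example (values made up):
#
#   2 3
#   hello 0.1 -0.2 0.3
#   world 0.0 0.5 -0.1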


def idx(words):
    """Map each word to the index of its first occurrence."""
    w2i = {}
    for i, w in enumerate(words):
        if w not in w2i:
            w2i[w] = i
    return w2i


def save_vectors(fname, x, words):
    """Write words and vectors in the fastText .vec text format."""
    n, d = x.shape
    fout = io.open(fname, 'w', encoding='utf-8')
    fout.write(u"%d %d\n" % (n, d))
    for i in range(n):
        fout.write(words[i] + " " + " ".join("%.4f" % a for a in x[i, :]) + "\n")
    fout.close()


def save_matrix(fname, x):
    """Write a plain matrix (no words) with the same "n d" header line."""
    n, d = x.shape
    fout = io.open(fname, 'w', encoding='utf-8')
    fout.write(u"%d %d\n" % (n, d))
    for i in range(n):
        fout.write(" ".join("%.4f" % a for a in x[i, :]) + "\n")
    fout.close()


def procrustes(X_src, Y_tgt):
    """Solve the orthogonal Procrustes problem: the orthogonal W minimizing
    ||X_src.dot(W.T) - Y_tgt||_F is W = U.dot(Vt), where U, S, Vt is the
    SVD of Y_tgt.T.dot(X_src)."""
    U, s, Vt = np.linalg.svd(np.dot(Y_tgt.T, X_src))
    return np.dot(U, Vt)
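

# Sanity-check sketch (not part of the original module): procrustes should
# exactly recover a known rotation from noise-free correspondences. Names
# and sizes below are illustrative only.
#
#   d = 50
#   Q, _ = np.linalg.qr(np.random.randn(d, d))  # random orthogonal matrix
#   X = np.random.randn(1000, d)
#   Y = X.dot(Q.T)                              # targets = rotated sources
#   assert np.allclose(procrustes(X, Y), Q, atol=1e-6)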


def select_vectors_from_pairs(x_src, y_tgt, pairs):
    """Stack the source/target vectors indexed by a list of (i, j) pairs
    into two row-aligned matrices."""
    n = len(pairs)
    d = x_src.shape[1]
    x = np.zeros([n, d])
    y = np.zeros([n, d])
    for k, (i, j) in enumerate(pairs):
        x[k, :] = x_src[i, :]
        y[k, :] = y_tgt[j, :]
    return x, y


def load_lexicon(filename, words_src, words_tgt, verbose=True):
    """Load a bilingual lexicon as a dict from source word index to the set
    of admissible target word indices. Also returns the number of distinct
    source words in the file, used as the evaluation denominator."""
    f = io.open(filename, 'r', encoding='utf-8')
    lexicon = collections.defaultdict(set)
    idx_src, idx_tgt = idx(words_src), idx(words_tgt)
    vocab = set()
    for line in f:
        word_src, word_tgt = line.split()
        if word_src in idx_src and word_tgt in idx_tgt:
            lexicon[idx_src[word_src]].add(idx_tgt[word_tgt])
        # Count every source word in the file, covered or not, so that
        # the reported coverage can be below 1.
        vocab.add(word_src)
    f.close()
    if verbose:
        coverage = len(lexicon) / float(len(vocab))
        print("Coverage of source vocab: %.4f" % coverage)
    return lexicon, float(len(vocab))


def load_pairs(filename, idx_src, idx_tgt, verbose=True):
    """Load (source, target) training pairs as index tuples, keeping only
    pairs whose words appear in both vocabularies."""
    f = io.open(filename, 'r', encoding='utf-8')
    pairs = []
    tot = 0
    for line in f:
        a, b = line.rstrip().split(' ')
        tot += 1
        if a in idx_src and b in idx_tgt:
            pairs.append((idx_src[a], idx_tgt[b]))
    f.close()
    if verbose:
        coverage = (1.0 * len(pairs)) / tot
        print("Found pairs for training: %d - Total pairs in file: %d - Coverage of pairs: %.4f"
              % (len(pairs), tot, coverage))
    return pairs


def compute_nn_accuracy(x_src, x_tgt, lexicon, bsz=100, lexicon_size=-1):
    """Word translation accuracy (precision@1) with cosine nearest-neighbor
    retrieval. Note: normalizes x_src and x_tgt in place."""
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    idx_src = list(lexicon.keys())
    acc = 0.0
    x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8
    x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8
    # Score queries against all targets in batches to bound memory.
    for i in range(0, len(idx_src), bsz):
        e = min(i + bsz, len(idx_src))
        scores = np.dot(x_tgt, x_src[idx_src[i:e]].T)
        pred = scores.argmax(axis=0)
        for j in range(i, e):
            if pred[j - i] in lexicon[idx_src[j]]:
                acc += 1.0
    return acc / lexicon_size


def compute_csls_accuracy(x_src, x_tgt, lexicon, lexicon_size=-1, k=10, bsz=1024):
    """Word translation accuracy (precision@1) with CSLS retrieval
    (Conneau et al., 2018). Note: normalizes x_src and x_tgt in place."""
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    idx_src = list(lexicon.keys())
    x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8
    x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8
    sr = x_src[idx_src]
    sc = np.dot(sr, x_tgt.T)
    # CSLS score: 2*cos(x, y) - r(y), where r(y) is the mean similarity of
    # target y to its k nearest source vectors. The symmetric query-side
    # term r(x) is constant across targets for a given query, so it does
    # not affect the argmax and is omitted.
    similarities = 2 * sc
    sc2 = np.zeros(x_tgt.shape[0])
    for i in range(0, x_tgt.shape[0], bsz):
        j = min(i + bsz, x_tgt.shape[0])
        sc_batch = np.dot(x_tgt[i:j, :], x_src.T)
        dotprod = np.partition(sc_batch, -k, axis=1)[:, -k:]
        sc2[i:j] = np.mean(dotprod, axis=1)
    similarities -= sc2[np.newaxis, :]
    nn = np.argmax(similarities, axis=1).tolist()
    correct = 0.0
    for i in range(len(lexicon)):
        if nn[i] in lexicon[idx_src[i]]:
            correct += 1.0
    return correct / lexicon_size
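

if __name__ == "__main__":
    # Minimal end-to-end sketch (not part of the original module): learn an
    # orthogonal source-to-target map from supervised pairs and evaluate it.
    # All file paths below are placeholders in the formats documented above.
    words_src, x_src = load_vectors("wiki.en.vec")
    words_tgt, x_tgt = load_vectors("wiki.fr.vec")
    pairs = load_pairs("en-fr.train.txt", idx(words_src), idx(words_tgt))
    x_train, y_train = select_vectors_from_pairs(x_src, x_tgt, pairs)
    W = procrustes(x_train, y_train)  # apply with x.dot(W.T)
    lexicon, lexicon_size = load_lexicon("en-fr.test.txt", words_src, words_tgt)
    x_map = x_src.dot(W.T)
    print("NN   precision@1: %.4f" % compute_nn_accuracy(
        x_map, x_tgt, lexicon, lexicon_size=lexicon_size))
    print("CSLS precision@1: %.4f" % compute_csls_accuracy(
        x_map, x_tgt, lexicon, lexicon_size=lexicon_size))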