1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
|
#!/usr/bin/env python
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division, absolute_import, print_function
from fasttext import load_model
from fasttext import util
import argparse
import numpy as np
def process_question(question, cossims, model, words, vectors):
correct = 0
num_qs = 0
num_lines = 0
for line in question:
num_lines += 1
qwords = line.split()
# We lowercase all words to correspond to the preprocessing
# we applied to our data.
qwords = [x.lower().strip() for x in qwords]
# If one of the words is not in the vocabulary we skip this question
found = True
for w in qwords:
if w not in words:
found = False
break
if not found:
continue
# The first three words form the query
# We retrieve their word vectors and normalize them
query = qwords[:3]
query = [model.get_word_vector(x) for x in query]
query = [x / np.linalg.norm(x) for x in query]
# Get the query vector. Example:
# Germany - Berlin + France
query = query[1] - query[0] + query[2]
# We don't need to rank all the words, only until we found
# the first word not equal to our set of query words.
ban_set = list(map(lambda x: words.index(x), qwords[:3]))
if words[util.find_nearest_neighbor(
query, vectors, ban_set, cossims=cossims
)] == qwords[3]:
correct += 1
num_qs += 1
return correct, num_qs, num_lines
# We use the same conventions as within compute-accuracy
def print_compute_accuracy_score(
question, correct, num_qs, total_accuracy, semantic_accuracy,
syntactic_accuracy
):
print(
(
"{0:>30}: ACCURACY TOP1: {3:.2f} % ({1} / {2})\t Total accuracy: {4:.2f} % Semantic accuracy: {5:.2f} % Syntactic accuracy: {6:.2f} %"
).format(
question,
correct,
num_qs,
correct / float(num_qs) * 100 if num_qs > 0 else 0,
total_accuracy * 100,
semantic_accuracy * 100,
syntactic_accuracy * 100,
)
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=(
"compute_accuracy equivalent in Python. "
"See https://github.com/tmikolov/word2vec/blob/master/demo-word-accuracy.sh"
)
)
parser.add_argument(
"model",
help="Model to use",
)
parser.add_argument(
"question_words",
help="word questions similar to tmikolov's file (see help for link)",
)
parser.add_argument(
"threshold",
help="threshold used to limit number of words used",
)
args = parser.parse_args()
args.threshold = int(args.threshold)
# Retrieve list of normalized word vectors for the first words up
# until the threshold count.
f = load_model(args.model)
# Gets words with associated frequeny sorted by default by descending order
words, freq = f.get_words(include_freq=True)
words = words[:args.threshold]
vectors = np.zeros((len(words), f.get_dimension()), dtype=float)
for i in range(len(words)):
wv = f.get_word_vector(words[i])
wv = wv / np.linalg.norm(wv)
vectors[i] = wv
total_correct = 0
total_qs = 0
total_num_lines = 0
total_se_correct = 0
total_se_qs = 0
total_sy_correct = 0
total_sy_qs = 0
qid = 0
questions = []
with open(args.question_words, 'r') as fqw:
questions = fqw.read().split(':')[1:]
# For efficiency preallocate the memory to calculate cosine similarities
cossims = np.zeros(len(words), dtype=float)
for question in questions:
quads = question.split('\n')
question = quads[0].strip()
quads = quads[1:-1]
correct, num_qs, num_lines = process_question(
quads, cossims, f, words, vectors
)
total_qs += num_qs
total_correct += correct
total_num_lines += num_lines
if (qid < 5):
total_se_correct += correct
total_se_qs += num_qs
else:
total_sy_correct += correct
total_sy_qs += num_qs
print_compute_accuracy_score(
question,
correct,
num_qs,
total_correct / float(total_qs) if total_qs > 0 else 0,
total_se_correct / float(total_se_qs) if total_se_qs > 0 else 0,
total_sy_correct / float(total_sy_qs) if total_sy_qs > 0 else 0,
)
qid += 1
print(
"Questions seen / total: {0} {1} {2:.2f} %".
format(
total_qs,
total_num_lines,
total_qs / total_num_lines * 100 if total_num_lines > 0 else 0,
)
)
|