# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from fasttext import load_model
from fasttext import tokenize
import sys
import time
import tempfile
import argparse
def get_word_vector(data, model):
    """Benchmark fastText ``get_word_vector`` over every token in a file.

    Args:
        data: Path to a text file; its whole contents are read and tokenized.
        model: Path to a fastText model file, loaded for the vector lookups.

    Prints the time spent reading/tokenizing, the token count, and the time
    spent looking up a vector for every token. Progress is written to stderr
    every 10000 tokens as a fraction of tokens processed.
    """
    t1 = time.time()
    print("Reading")
    with open(data, 'r') as f:
        tokens = tokenize(f.read())
    t2 = time.time()
    print("Read TIME: " + str(t2 - t1))
    print("Read NUM : " + str(len(tokens)))
    # Renamed from `f` so the model does not shadow the file handle above.
    ft = load_model(model)
    # This is not equivalent to piping the data into
    # print-word-vector, because the data is tokenized
    # first.
    num_tokens = len(tokens)  # hoisted: invariant across the loop
    t3 = time.time()
    # enumerate(start=1) replaces the original manual `i = 0; i += 1` counter.
    for i, t in enumerate(tokens, start=1):
        ft.get_word_vector(t)
        if i % 10000 == 0:
            sys.stderr.write("\ri: " + str(float(i / num_tokens)))
            sys.stderr.flush()
    t4 = time.time()
    print("\nVectoring: " + str(t4 - t3))
if __name__ == "__main__":
    # Parse the two positional arguments and run the benchmark.
    arg_parser = argparse.ArgumentParser(
        description='Simple benchmark for get_word_vector.')
    arg_parser.add_argument('model', help='A model file to use for benchmarking.')
    arg_parser.add_argument('data', help='A data file to use for benchmarking.')
    parsed_args = arg_parser.parse_args()
    get_word_vector(parsed_args.data, parsed_args.model)