File: get_word_vector.py

package info (click to toggle)
fasttext 0.9.2%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 4,900 kB
  • sloc: cpp: 5,458; python: 2,425; javascript: 635; sh: 616; makefile: 102; xml: 81; perl: 43
file content (50 lines) | stat: -rw-r--r-- 1,502 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from fasttext import load_model
from fasttext import tokenize
import sys
import time
import tempfile
import argparse


def get_word_vector(data, model):
    """Benchmark ``get_word_vector`` lookups over every token of a text file.

    Reads the file at ``data``, splits its contents with ``tokenize``,
    loads the model at ``model`` with ``load_model`` and requests the word
    vector of each token in order.  The elapsed time of the read/tokenize
    phase, the token count and the elapsed time of the lookup phase are
    printed to stdout; a progress ratio is written to stderr every 10000
    tokens.

    Args:
        data: Path of the text file to tokenize.  It is decoded as UTF-8.
        model: Path of the model file handed to ``load_model``.

    Returns:
        None.  Results are only reported on stdout/stderr.
    """
    # Imported locally so this function does not rely on a new module-level
    # import.  io.open accepts ``encoding`` on both Python 2 and Python 3,
    # whereas the Python 2 builtin open does not.
    import io

    t1 = time.time()
    print("Reading")
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default encoding.
    with io.open(data, 'r', encoding='utf-8') as data_file:
        tokens = tokenize(data_file.read())
    t2 = time.time()
    print("Read TIME: " + str(t2 - t1))
    print("Read NUM : " + str(len(tokens)))
    ft_model = load_model(model)
    # This is not equivalent to piping the data into
    # print-word-vector, because the data is tokenized
    # first.
    total = len(tokens)  # loop-invariant; only used for the progress ratio
    t3 = time.time()
    for i, token in enumerate(tokens, 1):
        # The vector itself is discarded: only the lookup cost is measured.
        ft_model.get_word_vector(token)
        if i % 10000 == 0:
            # "\r" rewinds the cursor so the ratio overwrites itself in place.
            sys.stderr.write("\ri: " + str(float(i) / total))
            sys.stderr.flush()
    t4 = time.time()
    print("\nVectoring: " + str(t4 - t3))


if __name__ == "__main__":
    # Command-line entry point: two required positionals, model then data.
    cli = argparse.ArgumentParser(
        description="Simple benchmark for get_word_vector."
    )
    for arg_name, arg_help in (
        ("model", "A model file to use for benchmarking."),
        ("data", "A data file to use for benchmarking."),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()
    # The benchmark function takes the data path first, then the model path.
    get_word_vector(data=options.data, model=options.model)