File: reduce_model.py

package info (click to toggle)
fasttext 0.9.2-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 7,800 kB
  • sloc: javascript: 10,266; cpp: 5,458; python: 2,425; sh: 616; makefile: 102; xml: 81; perl: 43
file content (98 lines) | stat: -rwxr-xr-x 2,853 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import os
import re
import sys

import fasttext
import fasttext.util

# Parsed command-line arguments; stays None until main() assigns the
# result of parser.parse_args() to it.
args = None


def eprint(*values, **options):
    """
    Print `values` to standard error.

    Every positional and keyword argument is forwarded unchanged to the
    built-in `print`; only the `file` destination is fixed to `sys.stderr`.
    """
    stream = sys.stderr
    print(*values, file=stream, **options)


def guess_target_name(model_file, initial_dim, target_dim):
    """
    Given a model name with the convention a.<dim>.b, this function
    returns the model's name with `target_dim` value.
    For example model_file name `cc.en.300.bin` with initial dim 300 becomes
    `cc.en.100.bin` when the `target_dim` is 100.

    Only the file name is rewritten; any leading directory part of
    `model_file` is returned untouched. When the file name does not contain
    `.<initial_dim>.`, `target_dim` is inserted before the extension instead
    (`model.bin` becomes `model.100.bin`).

    Args:
        model_file: path or bare file name of the original model.
        initial_dim: dimension encoded in the original file name.
        target_dim: dimension to encode in the returned name (integer).

    Returns:
        The path of the model file for `target_dim`.
    """
    # Work on the file name only, so that a directory component which
    # happens to contain the dimension (e.g. `/data/300/model.bin`) is never
    # rewritten. Slicing (rather than os.path.join) keeps the directory
    # prefix byte-for-byte identical to the input.
    base_name = os.path.basename(model_file)
    dir_prefix = model_file[:len(model_file) - len(base_name)]

    # The dots are literal separators: escape them (and the dimension) so
    # that e.g. `a13002.bin` is not mistaken for `a.<300>.bin`.
    prg = re.compile(r"(.*)\.%s\.(.*)" % re.escape(str(initial_dim)))
    m = prg.match(base_name)
    if m:
        new_name = "%s.%d.%s" % (m.group(1), target_dim, m.group(2))
    else:
        stem, ext = os.path.splitext(base_name)
        new_name = "%s.%d%s" % (stem, target_dim, ext)

    return dir_prefix + new_name


def command_reduce(model_file, target_dim, if_exists):
    """
    Given a `model_file`, this function reduces its dimension to `target_dim`
    by applying a PCA.

    The model is loaded with `fasttext.load_model`, reduced in place with
    `fasttext.util.reduce_model` and saved under the name returned by
    `guess_target_name`.

    Args:
        model_file: path of the fastText model to load.
        target_dim: wanted dimension; must be at least 1 and strictly less
            than the dimension of the loaded model.
        if_exists: what to do when the output file already exists:
            'overwrite' replaces it, 'strict' raises, and 'ignore' returns
            the existing file's name without reducing anything.

    Returns:
        The file name of the reduced model.

    Raises:
        ValueError: if `target_dim` is lower than 1, or if the output file
            exists and `if_exists` is not one of the values listed above.
        Exception: if `target_dim` is not lower than the model's dimension,
            or if the output file exists and `if_exists` is 'strict'.
    """
    # Reject meaningless dimensions before paying for the model load.
    if target_dim < 1:
        raise ValueError(
            "Target dimension (%d) should be a positive integer." % target_dim)

    eprint("Loading model")

    ft = fasttext.load_model(model_file)
    initial_dim = ft.get_dimension()
    if target_dim >= initial_dim:
        raise Exception("Target dimension (%d) should be less than initial dimension (%d)." % (
            target_dim, initial_dim))

    result_filename = guess_target_name(model_file, initial_dim, target_dim)
    if os.path.isfile(result_filename):
        if if_exists == 'ignore':
            return result_filename
        if if_exists == 'strict':
            raise Exception(
                "File already exists. Use --overwrite to overwrite.")
        if if_exists != 'overwrite':
            # An unrecognised policy must not silently clobber the file.
            raise ValueError(
                "Unknown if_exists value %r; expected 'overwrite', "
                "'strict' or 'ignore'." % (if_exists,))

    eprint("Reducing matrix dimensions")
    fasttext.util.reduce_model(ft, target_dim)

    eprint("Saving model")
    ft.save_model(result_filename)
    eprint("%s saved" % result_filename)

    return result_filename


def main():
    """
    Command-line entry point.

    Parses the `model` and `dim` positional arguments and the optional
    `--overwrite` flag, stores the parsed namespace in the module-level
    `args`, then calls `command_reduce`. An existing output file is
    overwritten only when `--overwrite` is given; otherwise the 'strict'
    policy is used.
    """
    global args

    cli = argparse.ArgumentParser(
        description='fastText helper tool to reduce model dimensions.')
    cli.add_argument(
        "model", type=str, help="model file to reduce. model.bin")
    cli.add_argument(
        "dim", type=int, help="targeted dimension of word vectors.")
    cli.add_argument(
        "--overwrite", action="store_true", help="overwrite if file exists.")

    args = cli.parse_args()

    if args.overwrite:
        policy = 'overwrite'
    else:
        policy = 'strict'

    command_reduce(args.model, args.dim, if_exists=policy)


# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()