File: glove_mwe_multipool_predict.py

package: python-thinc 6.12.1-1
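"""Thinc example: train a classifier over pairs of sentences (Quora duplicate
questions, SNLI, or StackExchange pairs). Each sentence is encoded with spaCy's
static (GloVe) vectors, a window-based multi-word-expression encoder
(ExtractWindow >> Maxout) and a configurable pooling layer; the two sentence
vectors are then concatenated and fed to a small Maxout/Softmax classifier.
"""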
from __future__ import unicode_literals, print_function
import plac
import spacy
from pathlib import Path
import dill as pickle

from thinc.neural import Model, Softmax, Maxout
from thinc.neural import ExtractWindow
from thinc.neural.pooling import Pooling, mean_pool, max_pool
from thinc.neural._classes.static_vectors import StaticVectors, get_word_ids
from thinc.neural._classes.embed import Embed
from thinc.neural._classes.difference import Siamese, CauchySimilarity
from thinc.neural.util import to_categorical

from thinc.api import layerize, with_flatten, with_getitem, flatten_add_lengths
from thinc.api import add, chain, clone, concatenate, Arg

from thinc.extra import datasets
from thinc.extra.load_nlp import get_spacy, get_vectors


epoch_train_acc = 0.
def track_progress(**context):
    '''Return a per-epoch callback that prints training progress.'''
    model = context['model']
    train_X = context['train_X']
    dev_X = context['dev_X']
    dev_y = context['dev_y']
    n_train = len(train_X)
    trainer = context['trainer']
    def each_epoch():
        global epoch_train_acc
        acc = model.evaluate(dev_X, dev_y)
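        # use_params() temporarily swaps in the averaged weights kept by the
        # optimizer, so we report accuracy for both the raw and the averaged
        # parameters.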
        with model.use_params(trainer.optimizer.averages):
            avg_acc = model.evaluate(dev_X, dev_y)
        stats = (acc, avg_acc, float(epoch_train_acc) / n_train, trainer.dropout)
        print("%.3f (%.3f) dev acc, %.3f train acc, %.4f drop" % stats)
        epoch_train_acc = 0.
    return each_epoch


def preprocess(ops, nlp, rows, get_ids):
    '''Parse the texts with spaCy. Make one-hot vectors for the labels.'''
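    # Each X is a pair of integer arrays (the word IDs of the two texts); the
    # labels come back as a one-hot float array of shape (len(rows), 2).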
    Xs = []
    ys = []
    for (text1, text2), label in rows:
        Xs.append((get_ids([nlp(text1)])[0], get_ids([nlp(text2)])[0]))
        ys.append(label)
    return Xs, to_categorical(ys, nb_classes=2)


@plac.annotations(
    dataset=("Dataset to load"),
    width=("Width of the hidden layers", "option", "w", int),
    depth=("Depth of the hidden layers", "option", "d", int),
    min_batch_size=("Minimum minibatch size during training", "option", "b", int),
    max_batch_size=("Maximum minibatch size during training", "option", "B", int),
    dropout=("Dropout rate", "option", "D", float),
    dropout_decay=("Dropout decay", "option", "C", float),
    use_gpu=("Whether to use GPU", "flag", "G", bool),
    nb_epoch=("Number of epochs", "option", "i", int),
    pieces=("Number of pieces for maxout", "option", "p", int),
    out_loc=("File to save the model", "option", "o"),
    quiet=("Don't print the progress bar", "flag", "q"),
    pooling=("Which pooling to use", "option", "P", str)
)
def main(dataset='quora', width=64, depth=2, min_batch_size=1,
        max_batch_size=128, dropout=0.0, dropout_decay=0.0, pooling="mean+max",
        nb_epoch=20, pieces=3, use_gpu=False, out_loc=None, quiet=False):
    cfg = dict(locals())
    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location (directory does not exist): %s" % out_loc)
    print(cfg)
    if pooling == 'mean+max':
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling: %s" % pooling)


    print("Load spaCy")
    nlp = get_spacy('en')

    #if use_gpu:
    #    Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    # i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    # (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    # i.e. (f|g)(x) -> hstack(f(x), g(x))
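    # For example, under these bindings the expression
    #     ExtractWindow(nW=1) >> Maxout(width, width*3, pieces=pieces)
    # is shorthand for chain(ExtractWindow(nW=1), Maxout(width, width*3, pieces=pieces)),
    # and mwe_encode ** depth is shorthand for clone(mwe_encode, depth).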
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate,
                                 '+': add}):
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width*3, pieces=pieces)

        embed = StaticVectors('en', width)  # + Embed(width, width*2, 5000)
        # Comments indicate the output type and shape at each step of the pipeline.
        # * B: Number of sentences in the batch
        # * T: Total number of words in the batch
        # (i.e. sum(len(sent) for sent in batch))
        # * W: Width of the network (input hyper-parameter)
        # * ids: ID for each word (integers).
        # * lengths: Number of words in each sentence in the batch (integers)
        # * floats: Standard dense vector.
        # (Dimensions annotated in curly braces.)
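        # For example, (ids{T}, lengths{B}) below is a tuple of an int array of
        # length T (every word ID in the batch) and an int array of length B
        # (how many words each sentence contributes).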
        sent2vec = ( # List[spacy.token.Doc]{B}
            flatten_add_lengths  # : (ids{T}, lengths{B})
            >> with_getitem(0,      # : word_ids{T}
                 embed
                 >> mwe_encode ** depth
            ) # : (floats{T, W}, lengths{B})
            >> pool_layer
            >> Maxout(width, pieces=pieces)
            >> Maxout(width, pieces=pieces)
        )
        model = (
            ((Arg(0) >> sent2vec) | (Arg(1) >> sent2vec))
            >> Maxout(width, pieces=pieces)
            >> Maxout(width, pieces=pieces)
            >> Softmax(2)
        )
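        # Putting it together: both sentences of a pair go through the *same*
        # sent2vec instance (so the encoder weights are shared), the two
        # sentence vectors are concatenated, and two Maxout layers plus a
        # Softmax produce the 2-class prediction.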

    print("Read and parse data: %s" % dataset)
    if dataset == 'quora':
        train, dev = datasets.quora_questions()
    elif dataset == 'snli':
        train, dev = datasets.snli()
    elif dataset == 'stackxc':
        train, dev = datasets.stack_exchange()
    elif dataset in ('quora+snli', 'snli+quora'):
        train, dev = datasets.quora_questions()
        train2, dev2 = datasets.snli()
        train.extend(train2)
        dev.extend(dev2)
    else:
        raise ValueError("Unknown dataset: %s" % dataset)
    get_ids = get_word_ids(Model.ops)
    train_X, train_y = preprocess(model.ops, nlp, train, get_ids)
    dev_X, dev_y = preprocess(model.ops, nlp, dev, get_ids)

    print("Initialize with data (LSUV)")
    print(dev_y.shape)
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (trainer, optimizer):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            # Slightly useful trick: the dropout rate comes from the trainer,
            # which decays it as training proceeds (see the dropout_decay
            # option).
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            assert yh.shape == y.shape, (yh.shape, y.shape)
            # No auto-diff: Just get a callback and pass the data through.
            # Hardly a hardship, and it means we don't have to create/maintain
            # a computational graph. We just use closures.
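            # The contract as used here: begin_update() returns the predictions
            # plus a closure; calling backprop(yh - y, optimizer) below pushes
            # the softmax/cross-entropy gradient back through the network and
            # lets the optimizer update the weights.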

            assert (yh >= 0.).all()
            train_acc = (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            epoch_train_acc += train_acc

            backprop(yh-y, optimizer)

            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
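            # Multiplying by 1.001 per update grows the batch size roughly
            # geometrically: it takes about ln(2)/ln(1.001) ~= 693 updates to
            # double, capped at max_batch_size.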
        if out_loc:
            out_loc = Path(out_loc)
            print('Saving to', out_loc)
            with out_loc.open('wb') as file_:
                pickle.dump(model, file_, -1)
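                # To reuse the model later, load it back with dill, e.g.
                # (a sketch, assuming the same thinc/spaCy versions):
                #     with out_loc.open('rb') as file_:
                #         model = pickle.load(file_)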


if __name__ == '__main__':
    if 1:
        plac.call(main)
    else:
        import cProfile
        import pstats
        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
        s = pstats.Stats("Profile.prof")
        s.strip_dirs().sort_stats("time").print_stats(100)