File: test_quicktour.py

Package: tokenizers 0.20.3+dfsg-1

from tokenizers import Tokenizer
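# data_dir and doc_wiki_tokenizer are pytest fixtures from the shared documentation test
# utilities; doc_wiki_tokenizer provides the path to a pre-trained tokenizer JSON that is
# used as a fallback further below.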
from ..utils import data_dir, doc_wiki_tokenizer


disable_printing = True
original_print = print


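# Shadow the builtin print so the documentation snippets below stay silent during test
# runs; the __main__ block at the bottom re-enables printing.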
def print(*args, **kwargs):
    if not disable_printing:
        original_print(*args, **kwargs)


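# The `# START <name>` / `# END <name>` markers below delimit the code snippets that are
# embedded in the library's quicktour documentation.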
class TestQuicktour:
    # This method contains everything we don't want to run in the test suite: training on
    # the full wikitext-103 corpus is slow, so it only happens when this file is executed directly.
    @staticmethod
    def slow_train():
        tokenizer, trainer = TestQuicktour.get_tokenizer_trainer()

        # START train
        files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
        tokenizer.train(files, trainer)
        # END train
        # START save
        tokenizer.save("data/tokenizer-wiki.json")
        # END save

    @staticmethod
    def get_tokenizer_trainer():
        # START init_tokenizer
        from tokenizers import Tokenizer
        from tokenizers.models import BPE

        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        # END init_tokenizer
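        # The trainer's special tokens are added to the vocabulary first, in this order,
        # so after training [UNK]=0, [CLS]=1, [SEP]=2, [PAD]=3 and [MASK]=4.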
        # START init_trainer
        from tokenizers.trainers import BpeTrainer

        trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
        # END init_trainer
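        # The Whitespace pre-tokenizer splits the text on whitespace and punctuation, so
        # BPE merges never cross word boundaries.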
        # START init_pretok
        from tokenizers.pre_tokenizers import Whitespace

        tokenizer.pre_tokenizer = Whitespace()
        # END init_pretok
        return tokenizer, trainer

    def test_quicktour(self, doc_wiki_tokenizer):
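        # Make print a no-op inside the test so the snippets below never write to stdout.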
        def print(*args, **kwargs):
            pass

        try:
            # START reload_tokenizer
            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
            # END reload_tokenizer
        except Exception:
            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)
        # START encode
        output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
        # END encode
        # START print_tokens
        print(output.tokens)
        # ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
        # END print_tokens
        assert output.tokens == [
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
        ]
        # START print_ids
        print(output.ids)
        # [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
        # END print_ids
        assert output.ids == [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
        # START print_offsets
        print(output.offsets[9])
        # (26, 27)
        # END print_offsets
        assert output.offsets[9] == (26, 27)
        # START use_offsets
        sentence = "Hello, y'all! How are you 😁 ?"
        sentence[26:27]
        # "😁"
        # END use_offsets
        assert sentence[26:27] == "😁"
        # START check_sep
        tokenizer.token_to_id("[SEP]")
        # 2
        # END check_sep
        assert tokenizer.token_to_id("[SEP]") == 2
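        # In the post-processing template, $A and $B stand for the first and second sequence;
        # the ':1' suffix assigns type id 1 to the second sequence and its closing [SEP],
        # while everything else keeps the default type id 0.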
        # START init_template_processing
        from tokenizers.processors import TemplateProcessing

        tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", tokenizer.token_to_id("[CLS]")),
                ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ],
        )
        # END init_template_processing
        # START print_special_tokens
        output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
        print(output.tokens)
        # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
        # END print_special_tokens
        assert output.tokens == [
            "[CLS]",
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
            "[SEP]",
        ]
        # START print_special_tokens_pair
        output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
        print(output.tokens)
        # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
        # END print_special_tokens_pair
        assert output.tokens == [
            "[CLS]",
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "[SEP]",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
            "[SEP]",
        ]
        # START print_type_ids
        print(output.type_ids)
        # [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
        # END print_type_ids
        assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
        # START encode_batch
        output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
        # END encode_batch
        # START encode_batch_pair
        output = tokenizer.encode_batch(
            [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
        )
        # END encode_batch_pair
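        # pad_id must match the id of the pad token in the vocabulary; with the trainer
        # above, "[PAD]" was assigned id 3.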
        # START enable_padding
        tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
        # END enable_padding
        # START print_batch_tokens
        output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
        print(output[1].tokens)
        # ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
        # END print_batch_tokens
        assert output[1].tokens == ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
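        # Padded positions receive an attention mask of 0 so downstream models can ignore them.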
        # START print_attention_mask
        print(output[1].attention_mask)
        # [1, 1, 1, 1, 1, 1, 1, 0]
        # END print_attention_mask
        assert output[1].attention_mask == [1, 1, 1, 1, 1, 1, 1, 0]


if __name__ == "__main__":
    import os
    from urllib import request
    from zipfile import ZipFile

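    # When this module is executed directly, download the raw wikitext-103 corpus (if it
    # is not already present under data/) and run the slow training path with printing enabled.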
    disable_printing = False
    if not os.path.isdir("data/wikitext-103-raw"):
        print("Downloading wikitext-103...")
        wiki_text, _ = request.urlretrieve(
            "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip"
        )
        with ZipFile(wiki_text, "r") as z:
            print("Unzipping in data...")
            z.extractall("data")

    print("Now training...")
    TestQuicktour.slow_train()