File: test_pipeline.py

from tokenizers import Tokenizer

from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer

# print() is shadowed so the documentation snippets below can print without
# cluttering pytest output; printing is re-enabled when this file is run as a script.
disable_printing = True
original_print = print


def print(*args, **kwargs):
    if not disable_printing:
        original_print(*args, **kwargs)


class TestPipeline:
    def test_pipeline(self, doc_wiki_tokenizer):
        try:
            # START reload_tokenizer
            from tokenizers import Tokenizer

            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
            # END reload_tokenizer
        except Exception:
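            # Fall back to the pytest fixture when data/tokenizer-wiki.json has not
            # been generated locally.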
            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)

        # START setup_normalizer
        from tokenizers import normalizers
        from tokenizers.normalizers import NFD, StripAccents

        normalizer = normalizers.Sequence([NFD(), StripAccents()])
        # END setup_normalizer
        # START test_normalizer
        normalizer.normalize_str("Héllò hôw are ü?")
        # "Hello how are u?"
        # END test_normalizer
        assert normalizer.normalize_str("Héllò hôw are ü?") == "Hello how are u?"
        # START replace_normalizer
        tokenizer.normalizer = normalizer
        # END replace_normalizer
        # START setup_pre_tokenizer
        from tokenizers.pre_tokenizers import Whitespace

        pre_tokenizer = Whitespace()
        pre_tokenizer.pre_tokenize_str("Hello! How are you? I'm fine, thank you.")
        # [("Hello", (0, 5)), ("!", (5, 6)), ("How", (7, 10)), ("are", (11, 14)), ("you", (15, 18)),
        #  ("?", (18, 19)), ("I", (20, 21)), ("'", (21, 22)), ('m', (22, 23)), ("fine", (24, 28)),
        #  (",", (28, 29)), ("thank", (30, 35)), ("you", (36, 39)), (".", (39, 40))]
        # END setup_pre_tokenizer
        assert pre_tokenizer.pre_tokenize_str("Hello! How are you? I'm fine, thank you.") == [
            ("Hello", (0, 5)),
            ("!", (5, 6)),
            ("How", (7, 10)),
            ("are", (11, 14)),
            ("you", (15, 18)),
            ("?", (18, 19)),
            ("I", (20, 21)),
            ("'", (21, 22)),
            ("m", (22, 23)),
            ("fine", (24, 28)),
            (",", (28, 29)),
            ("thank", (30, 35)),
            ("you", (36, 39)),
            (".", (39, 40)),
        ]
        # START combine_pre_tokenizer
        from tokenizers import pre_tokenizers
        from tokenizers.pre_tokenizers import Digits

        pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])
        pre_tokenizer.pre_tokenize_str("Call 911!")
        # [("Call", (0, 4)), ("9", (5, 6)), ("1", (6, 7)), ("1", (7, 8)), ("!", (8, 9))]
        # END combine_pre_tokenizer
        assert pre_tokenizer.pre_tokenize_str("Call 911!") == [
            ("Call", (0, 4)),
            ("9", (5, 6)),
            ("1", (6, 7)),
            ("1", (7, 8)),
            ("!", (8, 9)),
        ]
        # START replace_pre_tokenizer
        tokenizer.pre_tokenizer = pre_tokenizer
        # END replace_pre_tokenizer
        # START setup_processor
        from tokenizers.processors import TemplateProcessing

        tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
        )
        # END setup_processor
        # START test_decoding
        output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
        print(output.ids)
        # [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]

        tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2])
        # "Hello , y ' all ! How are you ?"
        # END test_decoding
        assert output.ids == [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
        assert (
            tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2])
            == "Hello , y ' all ! How are you ?"
        )

    @staticmethod
    def slow_train():
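        # Not collected by pytest (no "test_" prefix); run from the __main__ block
        # below to train and save the WordPiece tokenizer used by test_bert_example.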
        # START bert_setup_tokenizer
        from tokenizers import Tokenizer
        from tokenizers.models import WordPiece

        bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
        # END bert_setup_tokenizer
        # START bert_setup_normalizer
        from tokenizers import normalizers
        from tokenizers.normalizers import NFD, Lowercase, StripAccents

        bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
        # END bert_setup_normalizer
        # START bert_setup_pre_tokenizer
        from tokenizers.pre_tokenizers import Whitespace

        bert_tokenizer.pre_tokenizer = Whitespace()
        # END bert_setup_pre_tokenizer
        # START bert_setup_processor
        from tokenizers.processors import TemplateProcessing

        bert_tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", 1),
                ("[SEP]", 2),
            ],
        )
        # END bert_setup_processor
        # START bert_train_tokenizer
        from tokenizers.trainers import WordPieceTrainer

        trainer = WordPieceTrainer(vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
        files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
        bert_tokenizer.train(files, trainer)

        bert_tokenizer.save("data/bert-wiki.json")
        # END bert_train_tokenizer

    def test_bert_example(self, doc_pipeline_bert_tokenizer):
        try:
            bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")
        except Exception:
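            # Fall back to the pytest fixture when data/bert-wiki.json has not been
            # trained locally via slow_train().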
            bert_tokenizer = Tokenizer.from_file(doc_pipeline_bert_tokenizer)

        # START bert_test_decoding
        output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")
        print(output.tokens)
        # ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]

        bert_tokenizer.decode(output.ids)
        # "welcome to the tok ##eni ##zer ##s library ."
        # END bert_test_decoding
        assert bert_tokenizer.decode(output.ids) == "welcome to the tok ##eni ##zer ##s library ."
        # START bert_proper_decoding
        from tokenizers import decoders

        bert_tokenizer.decoder = decoders.WordPiece()
        bert_tokenizer.decode(output.ids)
        # "welcome to the tokenizers library."
        # END bert_proper_decoding
        assert bert_tokenizer.decode(output.ids) == "welcome to the tokenizers library."


if __name__ == "__main__":
    import os
    from urllib import request
    from zipfile import ZipFile

    disable_printing = False
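    # Running as a script: download wikitext-103-raw if needed, then train and save
    # the BERT tokenizer via slow_train().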
    if not os.path.isdir("data/wikitext-103-raw"):
        print("Downloading wikitext-103...")
        wiki_text, _ = request.urlretrieve(
            "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip"
        )
        with ZipFile(wiki_text, "r") as z:
            print("Unzipping in data...")
            z.extractall("data")

    print("Now training...")
    TestPipeline.slow_train()