File: test_base_tokenizer.py

package info (click to toggle)
tokenizers 0.20.3%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 5,480 kB
  • sloc: python: 4,499; javascript: 419; makefile: 124
file content (30 lines) | stat: -rw-r--r-- 1,456 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
from tokenizers.implementations import BaseTokenizer


class TestBaseTokenizer:
    def test_get_set_components(self):
        toki = Tokenizer(models.BPE())
        toki.normalizer = normalizers.NFC()
        toki.pre_tokenizer = pre_tokenizers.ByteLevel()
        toki.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
        toki.decoder = decoders.ByteLevel()

        tokenizer = BaseTokenizer(toki)

        assert isinstance(tokenizer.model, models.BPE)
        assert isinstance(tokenizer.normalizer, normalizers.NFC)
        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel)
        assert isinstance(tokenizer.post_processor, processors.BertProcessing)
        assert isinstance(tokenizer.decoder, decoders.ByteLevel)

        tokenizer.model = models.Unigram()
        assert isinstance(tokenizer.model, models.Unigram)
        tokenizer.normalizer = normalizers.NFD()
        assert isinstance(tokenizer.normalizer, normalizers.NFD)
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.Whitespace)
        tokenizer.post_processor = processors.ByteLevel()
        assert isinstance(tokenizer.post_processor, processors.ByteLevel)
        tokenizer.decoder = decoders.WordPiece()
        assert isinstance(tokenizer.decoder, decoders.WordPiece)