import pytest
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
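
# Tests for the SentencePiece-style tokenizer implementations shipped with
# `tokenizers`. Throughout, "▁" (U+2581, LOWER ONE EIGHTH BLOCK) is the
# SentencePiece word-boundary marker that the pre-tokenizer substitutes for
# whitespace, so expected tokens below are prefixed with it.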


class TestSentencePieceBPE:
    def test_train_from_iterator(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        # BPE merges frequent pairs, so words from the training data come
        # back as whole "▁"-prefixed tokens.
        output = tokenizer.encode("A sentence")
        assert output.tokens == ["▁A", "▁sentence"]


class TestSentencePieceUnigram:
    def test_train(self, tmpdir):
        p = tmpdir.mkdir("tmpdir").join("file.txt")
        p.write("A first sentence\nAnother sentence\nAnd a last one")

        tokenizer = SentencePieceUnigramTokenizer()
        tokenizer.train(files=str(p), show_progress=False)

        output = tokenizer.encode("A sentence")
        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]

        # Without an `unk_token`, encoding characters unseen during training
        # must fail rather than silently drop them.
        with pytest.raises(Exception) as excinfo:
            _ = tokenizer.encode("A sentence 🤗")
        assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing"

    def test_train_with_unk_token(self, tmpdir):
        p = tmpdir.mkdir("tmpdir").join("file.txt")
        p.write("A first sentence\nAnother sentence\nAnd a last one")

        tokenizer = SentencePieceUnigramTokenizer()
        tokenizer.train(files=str(p), show_progress=False, special_tokens=["<unk>"], unk_token="<unk>")

        # "🤗" was never seen during training, so it maps to the unknown
        # token, whose id is 0 because "<unk>" is the first special token.
        output = tokenizer.encode("A sentence 🤗")
        assert output.ids[-1] == 0
        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]

    def test_train_from_iterator(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = SentencePieceUnigramTokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        output = tokenizer.encode("A sentence")
        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]

        with pytest.raises(Exception) as excinfo:
            _ = tokenizer.encode("A sentence 🤗")
        assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing"

    def test_train_from_iterator_with_unk_token(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = SentencePieceUnigramTokenizer()
        tokenizer.train_from_iterator(
            text, vocab_size=100, show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
        )

        output = tokenizer.encode("A sentence 🤗")
        assert output.ids[-1] == 0
        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]