#!/bin/sh
# Smoke test for SentencePiece: train a tiny model on a fixed corpus and
# verify that spm_encode produces the expected segmentation.
set -e

# Remove temporary artifacts on every exit path — including the failure
# branch and any `set -e` abort (e.g. spm_train failing), which previously
# left input.txt and tiny.* behind.
cleanup() { rm -f input.txt tiny.*; }
trap cleanup EXIT

# Training corpus (the SentencePiece README abstract).
cat <<EOS > input.txt
SentencePiece is an unsupervised text tokenizer and detokenizer mainly
for Neural Network-based text generation systems where the vocabulary
size is predetermined prior to the neural model
training. SentencePiece implements subword units (e.g.,
byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model
[Kudo.]) with the extension of direct training from raw
sentences. SentencePiece allows us to make a purely end-to-end system
that does not depend on language-specific pre/postprocessing.
EOS

# Start from a clean slate, then train a tiny 100-token model.
# Trainer logging is intentionally suppressed; `set -e` still catches failure.
rm -f tiny.*
spm_train --input=input.txt --model_prefix=tiny --vocab_size=100 --character_coverage=1.0 >/dev/null 2>&1

encoded=$(echo "I saw a girl with a telescope." | spm_encode --model=tiny.model)
expected="▁ I ▁s a w ▁a ▁ g ir l ▁w i t h ▁a ▁t el e s c o p e ."
if [ "${expected}" = "${encoded}" ]; then
  echo "run spm_encode test"
else
  # Diagnostics belong on stderr so the failure is visible even when
  # stdout is redirected by a test harness.
  echo "Failed to spm_encode example: <${encoded}>" >&2
  exit 1
fi