#!/bin/sh
# Smoke test for SentencePiece: train a tiny model on a fixed corpus and
# verify that spm_encode produces the expected segmentation.
set -e

# Remove temporary artifacts on every exit path — including the failure
# branch and any `set -e` abort (e.g. spm_train failing), which previously
# left input.txt and tiny.* behind.
cleanup() { rm -f input.txt tiny.*; }
trap cleanup EXIT

# Training corpus (the SentencePiece README abstract).
cat <<EOS > input.txt
SentencePiece is an unsupervised text tokenizer and detokenizer mainly
for Neural Network-based text generation systems where the vocabulary
size is predetermined prior to the neural model
training. SentencePiece implements subword units (e.g.,
byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model
[Kudo.]) with the extension of direct training from raw
sentences. SentencePiece allows us to make a purely end-to-end system
that does not depend on language-specific pre/postprocessing.
EOS

# Start from a clean slate, then train a tiny 100-token model.
# Trainer logging is intentionally suppressed; `set -e` still catches failure.
rm -f tiny.*
spm_train --input=input.txt --model_prefix=tiny --vocab_size=100 --character_coverage=1.0 >/dev/null 2>&1

encoded=$(echo "I saw a girl with a telescope." | spm_encode --model=tiny.model)
expected="▁ I ▁s a w ▁a ▁ g ir l ▁w i t h ▁a ▁t el e s c o p e ."
if [ "${expected}" = "${encoded}" ]; then
  echo "run spm_encode test"
else
  # Diagnostics belong on stderr so the failure is visible even when
  # stdout is redirected by a test harness.
  echo "Failed to spm_encode example: <${encoded}>" >&2
  exit 1
fi