File: spm-encode

package info
sentencepiece 0.2.1-1
  • area: main
  • in suites: forky, sid
  • size: 53,912 kB
  • sloc: cpp: 190,245; python: 1,776; xml: 231; perl: 198; sh: 58; pascal: 50; makefile: 23

file content:

#!/bin/sh
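# Smoke test: train a tiny SentencePiece model on a fixed corpus and
# verify that spm_encode produces the expected piece sequence.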

set -e

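# Training corpus, reproduced verbatim from the SentencePiece README
# (the stray ")" after "[Kudo.]" is in the original). The expected
# encoding below depends on this exact text, so do not edit it.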
cat <<EOS > input.txt
SentencePiece is an unsupervised text tokenizer and detokenizer mainly
for Neural Network-based text generation systems where the vocabulary
size is predetermined prior to the neural model
training. SentencePiece implements subword units (e.g.,
byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model
[Kudo.]) with the extension of direct training from raw
sentences. SentencePiece allows us to make a purely end-to-end system
that does not depend on language-specific pre/postprocessing.
EOS

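# Remove stale artifacts, then train a tiny model; spm_train defaults
# to the unigram algorithm and writes tiny.model and tiny.vocab.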
rm -f tiny.*
spm_train --input=input.txt --model_prefix=tiny --vocab_size=100 --character_coverage=1.0 >/dev/null 2>&1
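# Encode a sample sentence; "▁" (U+2581) marks word boundaries in
# SentencePiece output, and the expected string lists the pieces
# separated by spaces.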
encoded=$(echo "I saw a girl with a telescope." | spm_encode --model=tiny.model)
if [ "▁ I ▁s a w ▁a ▁ g ir l ▁w i t h ▁a ▁t el e s c o p e ." = "${encoded}" ]; then
   echo "run spm_encode test"
else
   echo "Failed to spm_encode example: <${encoded}>"
   exit 1
fi
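# Clean up the corpus and model files.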
rm -f input.txt
rm -f tiny.*