1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
|
# SentencePiece Python Wrapper
Python wrapper for SentencePiece. This API offers encoding, decoding, and training of SentencePiece models.
## Build and Install SentencePiece
For Linux (x64/i686), macOS, and Windows (win32/x64/arm64) environments, you can simply use the pip command to install the SentencePiece Python module.
```
% pip install sentencepiece
```
Before building SentencePiece from source on Linux, ensure that the following dependencies are installed.
```
% sudo apt update
% sudo apt install -y cmake pkg-config libsentencepiece-dev
```
To build and install the Python wrapper from source, run the following commands to build and install the wheel package.
```
% git clone https://github.com/google/sentencepiece.git
% cd sentencepiece
% mkdir build
% cd build
% cmake .. -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=./root -DSPM_DISABLE_EMBEDDED_DATA=ON
% make install
% cd ../python
% python setup.py bdist_wheel
% pip install dist/sentencepiece*.whl
```
If you don’t have write permission to the global site-packages directory or don’t want to install into it, please try:
```
% python setup.py install --user
```
For Windows users who want to build from source, you can build and install the Python wrapper using Visual Studio. First, you need to install `pwsh.exe` (PowerShell 7). Use `winget install --id Microsoft.Powershell --source winget` to install it directly. Then open the `Developer PowerShell for VS 2022` and execute the following commands.
```
git clone https://github.com/google/sentencepiece.git
cd sentencepiece
mkdir build
cd build
cmake .. -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=".\root" -DSPM_DISABLE_EMBEDDED_DATA=ON
cmake --build . --config Release --target install
cd ../python
pip install wheel
python setup.py bdist_wheel
Get-ChildItem .\dist\sentencepiece*.whl | ForEach-Object { pip install $_.FullName }
```
## Usage
See [this Google Colab page](https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb) to run SentencePiece interactively.
### Segmentation
```
% python
>>> import sentencepiece as spm
>>> sp = spm.SentencePieceProcessor(model_file='test/test_model.model')
>>> sp.encode('This is a test')
[284, 47, 11, 4, 15, 400]
>>> sp.encode(['This is a test', 'Hello world'], out_type=int)
[[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]]
>>> sp.encode_as_ids(['This is a test', 'Hello world'])
[[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]]
>>> sp.encode('This is a test', out_type=str)
['▁This', '▁is', '▁a', '▁', 't', 'est']
>>> sp.encode(['This is a test', 'Hello world'], out_type=str)
[['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']]
>>> sp.encode_as_pieces(['This is a test', 'Hello world'])
[['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']]
>>> proto = sp.encode('This is a test', out_type='immutable_proto')
>>> for n in proto.pieces:
...     print('piece="{}" surface="{}" id={} begin={} end={}'.format(n.piece, n.surface, n.id, n.begin, n.end))
...
piece="▁This" surface="This" id=284 begin=0 end=4
piece="▁is" surface=" is" id=47 begin=4 end=7
piece="▁a" surface=" a" id=11 begin=7 end=9
piece="▁" surface=" " id=4 begin=9 end=10
piece="t" surface="t" id=15 begin=10 end=11
piece="est" surface="est" id=400 begin=11 end=14
>>> [[x.id for x in proto.pieces], [x.piece for x in proto.pieces], [x.begin for x in proto.pieces], [x.end for x in proto.pieces]]
[[284, 47, 11, 4, 15, 400], ['▁This', '▁is', '▁a', '▁', 't', 'est'], [0, 4, 7, 9, 10, 11], [4, 7, 9, 10, 11, 14]]
>>> proto2 = sp.encode_as_immutable_proto('This is a test')
>>> proto2 == proto
True
>>> for _ in range(10):
...     sp.encode('This is a test', out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1)
...
['▁', 'This', '▁', 'is', '▁a', '▁', 't', 'e', 'st']
['▁T', 'h', 'i', 's', '▁is', '▁a', '▁', 'te', 's', 't']
['▁T', 'h', 'is', '▁', 'is', '▁', 'a', '▁', 't', 'est']
['▁', 'This', '▁is', '▁', 'a', '▁', 't', 'e', 'st']
['▁', 'This', '▁', 'is', '▁', 'a', '▁', 't', 'e', 's', 't']
['▁This', '▁is', '▁a', '▁', 'te', 's', 't']
['▁This', '▁is', '▁', 'a', '▁', 't', 'e', 'st']
['▁', 'T', 'h', 'is', '▁', 'is', '▁', 'a', '▁', 'te', 'st']
['▁', 'This', '▁', 'i', 's', '▁a', '▁', 't', 'e', 'st']
['▁This', '▁', 'is', '▁a', '▁', 't', 'est']
>>> sp.nbest_encode('This is a test', nbest_size=5, out_type=str)
[['▁This', '▁is', '▁a', '▁', 't', 'est'],
['▁This', '▁is', '▁a', '▁', 'te', 'st'],
['▁This', '▁is', '▁a', '▁', 'te', 's', 't'],
['▁This', '▁is', '▁a', '▁', 't', 'e', 'st'],
['▁This', '▁is', '▁a', '▁', 't', 'es', 't']]
>>> sp.sample_encode_and_score('This is a test', num_samples=5, alpha=0.1, out_type=str, wor=True)
[(['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 's', 't'], -3.043105125427246),
(['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 'st'], -2.8475849628448486),
(['▁', 'This', '▁is', '▁', 'a', '▁', 'te', 'st'], -3.043248176574707),
(['▁', 'This', '▁is', '▁a', '▁', 't', 'e', 'st'], -2.87727689743042),
(['▁', 'This', '▁', 'i', 's', '▁', 'a', '▁', 't', 'est'], -3.6284031867980957)]
>>> sp.decode([284, 47, 11, 4, 15, 400])
'This is a test'
>>> sp.decode([[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]])
['This is a test', 'Hello world']
>>> proto = sp.decode([284, 47, 11, 4, 15, 400], out_type='immutable_proto')
>>> proto.text
'This is a test'
>>> sp.decode(['▁', 'This', '▁', 'is', '▁a', '▁', 't', 'e', 'st'])
'This is a test'
>>> sp.decode([['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']])
['This is a test', 'Hello world']
>>> sp.get_piece_size()
1000
>>> sp.id_to_piece(2)
'</s>'
>>> sp.id_to_piece([2, 3, 4])
['</s>', '\r', '▁']
>>> sp.piece_to_id('<s>')
1
>>> sp.piece_to_id(['</s>', '\r', '▁'])
[2, 3, 4]
>>> len(sp)
1000
>>> sp['</s>']
2
```
### Model Training
Training is performed by passing the parameters of [spm_train](https://github.com/google/sentencepiece#train-sentencepiece-model) to the `SentencePieceTrainer.train()` function.
```
>>> import sentencepiece as spm
>>> spm.SentencePieceTrainer.train(input='test/botchan.txt', model_prefix='m', vocab_size=1000, user_defined_symbols=['foo', 'bar'])
sentencepiece_trainer.cc(73) LOG(INFO) Starts training with :
trainer_spec {
input: test/botchan.txt
.. snip
unigram_model_trainer.cc(500) LOG(INFO) EM sub_iter=1 size=1188 obj=10.2839 num_tokens=32182 num_tokens/piece=27.0892
unigram_model_trainer.cc(500) LOG(INFO) EM sub_iter=0 size=1100 obj=10.4269 num_tokens=33001 num_tokens/piece=30.0009
unigram_model_trainer.cc(500) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4069 num_tokens=33002 num_tokens/piece=30.0018
trainer_interface.cc(595) LOG(INFO) Saving model: m.model
trainer_interface.cc(619) LOG(INFO) Saving vocabs: m.vocab
>>>
```
### Training without local filesystem
The SentencePiece trainer can receive any iterable object to feed training sentences. You can also pass a file object (an instance with a write() method) to emit the output model to any device. These features are useful for running SentencePiece in environments that have limited access to the local file system (e.g., Google Colab).
```
import io
import urllib.request

import sentencepiece as spm

# Stream the training corpus from a URL (the HTTP response is passed as the
# sentence iterator) and write the trained model into an in-memory BytesIO
# buffer instead of a file on disk.
model = io.BytesIO()
with urllib.request.urlopen(
    'https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt'
) as response:
    spm.SentencePieceTrainer.train(
        sentence_iterator=response, model_writer=model, vocab_size=1000)

# Optionally serialize the model to a file.
# with open('out.model', 'wb') as f:
#     f.write(model.getvalue())

# Load the model directly from the serialized bytes held in memory.
sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
print(sp.encode('this is test'))
```
### Free Threading support
Experimental support for no-GIL/free-threaded Python has been available since v0.2.1. For more details, please refer to [this page](https://py-free-threading.github.io/).
This operates similarly to how [NumPy](https://numpy.org/devdocs/reference/thread_safety.html#free-threaded-python) handles it.
The C++ library's const and static methods, e.g., encode(), decode() and train(), are designed to work in a non-GIL environment.
However, non-const methods, e.g., load(), may have potential data race issues, so please ensure you implement appropriate locks beforehand.
While this limitation might be removed in the future, please note that it's not a simple fix, as it would require additional shared locks in C++.
|