From: Aleksey Morozov <36787333+amrzv@users.noreply.github.com>
Date: Tue, 9 Aug 2022 15:15:30 +0300
Subject: Fixed errors in example notebook
Signed-off-by: Kentaro Hayashi <kenhys@gmail.com>
---
python/sentencepiece_python_module_example.ipynb | 44 ++++++++++--------------
1 file changed, 19 insertions(+), 25 deletions(-)
diff --git a/python/sentencepiece_python_module_example.ipynb b/python/sentencepiece_python_module_example.ipynb
index 78464d1..1eb0f9c 100644
--- a/python/sentencepiece_python_module_example.ipynb
+++ b/python/sentencepiece_python_module_example.ipynb
@@ -216,7 +216,7 @@
"import tensorflow as tf\n",
"\n",
"# Assumes that m.model is stored in non-Posix file system.\n",
- "serialized_model_proto = tf.gfile.GFile('m.model', 'rb').read()\n",
+ "serialized_model_proto = tf.io.gfile.GFile('m.model', 'rb').read()\n",
"\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load_from_serialized_proto(serialized_model_proto)\n",
@@ -265,7 +265,7 @@
},
"cell_type": "code",
"source": [
- "## Example of user defined symbols\n",
+ "# Example of user defined symbols\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_user --user_defined_symbols=<sep>,<cls> --vocab_size=2000')\n",
"\n",
"sp_user = spm.SentencePieceProcessor()\n",
@@ -307,7 +307,7 @@
},
"cell_type": "code",
"source": [
- "## Example of control symbols\n",
+ "# Example of control symbols\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_ctrl --control_symbols=<sep>,<cls> --vocab_size=2000')\n",
"\n",
"sp_ctrl = spm.SentencePieceProcessor()\n",
@@ -564,7 +564,7 @@
"spm.SentencePieceTrainer.train('--input=botchan.txt --vocab_size=2000 --model_prefix=m --unk_surface=__UNKNOWN__')\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load('m.model')\n",
- "print(sp.decode_ids([sp.unk_id()])) "
+ "print(sp.decode_ids([sp.unk_id()]))"
],
"execution_count": 0,
"outputs": [
@@ -608,7 +608,7 @@
"# There are two hyperparamenters for sampling (nbest_size and inverse temperature). see the paper [kudo18] for detail.\n",
"for n in range(10):\n",
" print(sp.sample_encode_as_pieces('hello world', -1, 0.1))\n",
- " \n",
+ "\n",
"for n in range(10):\n",
" print(sp.sample_encode_as_ids('hello world', -1, 0.1))"
],
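A minimal sketch (not part of this patch) contrasting the two sampling hyperparameters mentioned above, assuming m.model was trained in the earlier cells:

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.load('m.model')

    # nbest_size=-1 samples from the full segmentation lattice.
    # A smaller alpha (inverse temperature) flattens the distribution, giving more diverse samples.
    print(sp.sample_encode_as_pieces('hello world', -1, 0.05))
    # A larger alpha concentrates sampling on the most likely segmentation.
    print(sp.sample_encode_as_pieces('hello world', -1, 1.0))
    # A positive nbest_size restricts sampling to the n-best segmentations.
    print(sp.sample_encode_as_pieces('hello world', 10, 0.5))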
@@ -858,8 +858,6 @@
},
"cell_type": "code",
"source": [
- "import sentencepiece as spm\n",
- "\n",
"# NFKC normalization and lower casing.\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000 --normalization_rule_name=nfkc_cf')\n",
"\n",
@@ -903,11 +901,12 @@
},
"cell_type": "code",
"source": [
- "def tocode(s): \n",
- " out = [] \n",
- " for c in s: \n",
- " out.append(str(hex(ord(c))).replace('0x', 'U+')) \n",
- " return ' '.join(out) \n",
+ "def tocode(s):\n",
+ " out = []\n",
+ " for c in s:\n",
+ " out.append(str(hex(ord(c))).replace('0x', 'U+'))\n",
+ " return ' '.join(out)\n",
+ "\n",
"\n",
"# TSV format: source Unicode code points <tab> target code points\n",
"# normalize \"don't => do not, I'm => I am\"\n",
@@ -923,7 +922,7 @@
"# m.model embeds the normalization rule compiled into an FST.\n",
"sp.load('m.model')\n",
"print(sp.encode_as_pieces(\"I'm busy\")) # normalzied to `I am busy'\n",
- "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'\n"
+ "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'"
],
"execution_count": 0,
"outputs": [
@@ -1029,9 +1028,9 @@
" for piece in sp.encode_as_pieces(line):\n",
" freq.setdefault(piece, 0)\n",
" freq[piece] += 1\n",
- " \n",
+ "\n",
"# only uses the token appearing more than 1000 times in the training data.\n",
- "vocabs = list(filter(lambda x : x in freq and freq[x] > 1000, vocabs))\n",
+ "vocabs = list(filter(lambda x: x in freq and freq[x] > 1000, vocabs))\n",
"sp.set_vocabulary(vocabs)\n",
"print(sp.encode_as_pieces('this is a test.'))\n",
"\n",
@@ -1133,20 +1132,17 @@
},
"cell_type": "code",
"source": [
- "freq={}\n",
+ "freq = {}\n",
"with open('botchan.txt', 'r') as f:\n",
" for line in f:\n",
" line = line.rstrip()\n",
" for piece in line.split():\n",
" freq.setdefault(piece, 0)\n",
" freq[piece] += 1\n",
- " \n",
+ "\n",
"with open('word_freq_list.tsv', 'w') as f:\n",
" for k, v in freq.items():\n",
" f.write('%s\\t%d\\n' % (k, v))\n",
- " \n",
- "\n",
- "import sentencepiece as spm\n",
"\n",
"spm.SentencePieceTrainer.train('--input=word_freq_list.tsv --input_format=tsv --model_prefix=m --vocab_size=2000')\n",
"sp = spm.SentencePieceProcessor()\n",
@@ -1176,7 +1172,7 @@
"\n",
"Sentencepiece keeps track of byte offset (span) of each token, which is useful for highlighting the token on top of unnormalized text.\n",
"\n",
- "We first need to install protobuf module and sentencepiece_pb2.py as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
+ "We first need to install protobuf module as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
"**encode_as_serialized_proto** method resturns serialized SentencePieceText proto. You can get the deserialized object by calling ParseFromString method.\n",
"\n",
"The definition of SentencePieceText proto is found [here](https://github.com/google/sentencepiece/blob/3be3f2e11e2bb923c579c6be5e7335809341587f/src/sentencepiece.proto#L23).\n"
@@ -1194,8 +1190,7 @@
},
"cell_type": "code",
"source": [
- "!pip install protobuf\n",
- "!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py"
+ "!pip install protobuf"
],
"execution_count": 0,
"outputs": [
@@ -1233,8 +1228,7 @@
},
"cell_type": "code",
"source": [
- "import sentencepiece_pb2\n",
- "import sentencepiece as spm\n",
+ "from sentencepiece import sentencepiece_pb2\n",
"\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000')\n",
"\n",