From: Aleksey Morozov <36787333+amrzv@users.noreply.github.com>
Date: Tue, 9 Aug 2022 15:15:30 +0300
Subject: Fixed errors in example notebook
Signed-off-by: Kentaro Hayashi <kenhys@gmail.com>
---
python/sentencepiece_python_module_example.ipynb | 44 ++++++++++--------------
1 file changed, 19 insertions(+), 25 deletions(-)
diff --git a/python/sentencepiece_python_module_example.ipynb b/python/sentencepiece_python_module_example.ipynb
index 78464d1..1eb0f9c 100644
--- a/python/sentencepiece_python_module_example.ipynb
+++ b/python/sentencepiece_python_module_example.ipynb
@@ -216,7 +216,7 @@
"import tensorflow as tf\n",
"\n",
"# Assumes that m.model is stored in non-Posix file system.\n",
- "serialized_model_proto = tf.gfile.GFile('m.model', 'rb').read()\n",
+ "serialized_model_proto = tf.io.gfile.GFile('m.model', 'rb').read()\n",
"\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load_from_serialized_proto(serialized_model_proto)\n",
@@ -265,7 +265,7 @@
},
"cell_type": "code",
"source": [
- "## Example of user defined symbols\n",
+ "# Example of user defined symbols\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_user --user_defined_symbols=<sep>,<cls> --vocab_size=2000')\n",
"\n",
"sp_user = spm.SentencePieceProcessor()\n",
@@ -307,7 +307,7 @@
},
"cell_type": "code",
"source": [
- "## Example of control symbols\n",
+ "# Example of control symbols\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_ctrl --control_symbols=<sep>,<cls> --vocab_size=2000')\n",
"\n",
"sp_ctrl = spm.SentencePieceProcessor()\n",
@@ -564,7 +564,7 @@
"spm.SentencePieceTrainer.train('--input=botchan.txt --vocab_size=2000 --model_prefix=m --unk_surface=__UNKNOWN__')\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load('m.model')\n",
- "print(sp.decode_ids([sp.unk_id()])) "
+ "print(sp.decode_ids([sp.unk_id()]))"
],
"execution_count": 0,
"outputs": [
@@ -608,7 +608,7 @@
"# There are two hyperparamenters for sampling (nbest_size and inverse temperature). see the paper [kudo18] for detail.\n",
"for n in range(10):\n",
" print(sp.sample_encode_as_pieces('hello world', -1, 0.1))\n",
- " \n",
+ "\n",
"for n in range(10):\n",
" print(sp.sample_encode_as_ids('hello world', -1, 0.1))"
],
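A minimal sketch (not part of this patch) contrasting the two sampling hyperparameters mentioned above, assuming m.model was trained in the earlier cells:

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.load('m.model')

    # nbest_size=-1 samples from the full segmentation lattice.
    # A smaller alpha (inverse temperature) flattens the distribution, giving more diverse samples.
    print(sp.sample_encode_as_pieces('hello world', -1, 0.05))
    # A larger alpha concentrates sampling on the most likely segmentation.
    print(sp.sample_encode_as_pieces('hello world', -1, 1.0))
    # A positive nbest_size restricts sampling to the n-best segmentations.
    print(sp.sample_encode_as_pieces('hello world', 10, 0.5))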
@@ -858,8 +858,6 @@
},
"cell_type": "code",
"source": [
- "import sentencepiece as spm\n",
- "\n",
"# NFKC normalization and lower casing.\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000 --normalization_rule_name=nfkc_cf')\n",
"\n",
@@ -903,11 +901,12 @@
},
"cell_type": "code",
"source": [
- "def tocode(s): \n",
- " out = [] \n",
- " for c in s: \n",
- " out.append(str(hex(ord(c))).replace('0x', 'U+')) \n",
- " return ' '.join(out) \n",
+ "def tocode(s):\n",
+ " out = []\n",
+ " for c in s:\n",
+ " out.append(str(hex(ord(c))).replace('0x', 'U+'))\n",
+ " return ' '.join(out)\n",
+ "\n",
"\n",
"# TSV format: source Unicode code points <tab> target code points\n",
"# normalize \"don't => do not, I'm => I am\"\n",
@@ -923,7 +922,7 @@
"# m.model embeds the normalization rule compiled into an FST.\n",
"sp.load('m.model')\n",
"print(sp.encode_as_pieces(\"I'm busy\")) # normalzied to `I am busy'\n",
- "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'\n"
+ "print(sp.encode_as_pieces(\"I don't know it.\")) # normalized to 'I do not know it.'"
],
"execution_count": 0,
"outputs": [
@@ -1029,9 +1028,9 @@
" for piece in sp.encode_as_pieces(line):\n",
" freq.setdefault(piece, 0)\n",
" freq[piece] += 1\n",
- " \n",
+ "\n",
"# only uses the token appearing more than 1000 times in the training data.\n",
- "vocabs = list(filter(lambda x : x in freq and freq[x] > 1000, vocabs))\n",
+ "vocabs = list(filter(lambda x: x in freq and freq[x] > 1000, vocabs))\n",
"sp.set_vocabulary(vocabs)\n",
"print(sp.encode_as_pieces('this is a test.'))\n",
"\n",
@@ -1133,20 +1132,17 @@
},
"cell_type": "code",
"source": [
- "freq={}\n",
+ "freq = {}\n",
"with open('botchan.txt', 'r') as f:\n",
" for line in f:\n",
" line = line.rstrip()\n",
" for piece in line.split():\n",
" freq.setdefault(piece, 0)\n",
" freq[piece] += 1\n",
- " \n",
+ "\n",
"with open('word_freq_list.tsv', 'w') as f:\n",
" for k, v in freq.items():\n",
" f.write('%s\\t%d\\n' % (k, v))\n",
- " \n",
- "\n",
- "import sentencepiece as spm\n",
"\n",
"spm.SentencePieceTrainer.train('--input=word_freq_list.tsv --input_format=tsv --model_prefix=m --vocab_size=2000')\n",
"sp = spm.SentencePieceProcessor()\n",
@@ -1176,7 +1172,7 @@
"\n",
"Sentencepiece keeps track of byte offset (span) of each token, which is useful for highlighting the token on top of unnormalized text.\n",
"\n",
- "We first need to install protobuf module and sentencepiece_pb2.py as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
+ "We first need to install protobuf module as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
"**encode_as_serialized_proto** method resturns serialized SentencePieceText proto. You can get the deserialized object by calling ParseFromString method.\n",
"\n",
"The definition of SentencePieceText proto is found [here](https://github.com/google/sentencepiece/blob/3be3f2e11e2bb923c579c6be5e7335809341587f/src/sentencepiece.proto#L23).\n"
@@ -1194,8 +1190,7 @@
},
"cell_type": "code",
"source": [
- "!pip install protobuf\n",
- "!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py"
+ "!pip install protobuf"
],
"execution_count": 0,
"outputs": [
@@ -1233,8 +1228,7 @@
},
"cell_type": "code",
"source": [
- "import sentencepiece_pb2\n",
- "import sentencepiece as spm\n",
+ "from sentencepiece import sentencepiece_pb2\n",
"\n",
"spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000')\n",
"\n",