File: 0021-Fixed-errors-in-example-notebook.patch

From: Aleksey Morozov <36787333+amrzv@users.noreply.github.com>
Date: Tue, 9 Aug 2022 15:15:30 +0300
Subject: Fixed errors in example notebook

Signed-off-by: Kentaro Hayashi <kenhys@gmail.com>
---
 python/sentencepiece_python_module_example.ipynb | 44 ++++++++++--------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/python/sentencepiece_python_module_example.ipynb b/python/sentencepiece_python_module_example.ipynb
index 78464d1..1eb0f9c 100644
--- a/python/sentencepiece_python_module_example.ipynb
+++ b/python/sentencepiece_python_module_example.ipynb
@@ -216,7 +216,7 @@
         "import tensorflow as tf\n",
         "\n",
         "# Assumes that m.model is stored in non-Posix file system.\n",
-        "serialized_model_proto = tf.gfile.GFile('m.model', 'rb').read()\n",
+        "serialized_model_proto = tf.io.gfile.GFile('m.model', 'rb').read()\n",
         "\n",
         "sp = spm.SentencePieceProcessor()\n",
         "sp.load_from_serialized_proto(serialized_model_proto)\n",
@@ -265,7 +265,7 @@
       },
       "cell_type": "code",
       "source": [
-        "## Example of user defined symbols\n",
+        "# Example of user defined symbols\n",
         "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_user --user_defined_symbols=<sep>,<cls> --vocab_size=2000')\n",
         "\n",
         "sp_user = spm.SentencePieceProcessor()\n",
@@ -307,7 +307,7 @@
       },
       "cell_type": "code",
       "source": [
-        "## Example of control symbols\n",
+        "# Example of control symbols\n",
         "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m_ctrl --control_symbols=<sep>,<cls> --vocab_size=2000')\n",
         "\n",
         "sp_ctrl = spm.SentencePieceProcessor()\n",
@@ -564,7 +564,7 @@
         "spm.SentencePieceTrainer.train('--input=botchan.txt --vocab_size=2000 --model_prefix=m --unk_surface=__UNKNOWN__')\n",
         "sp = spm.SentencePieceProcessor()\n",
         "sp.load('m.model')\n",
-        "print(sp.decode_ids([sp.unk_id()])) "
+        "print(sp.decode_ids([sp.unk_id()]))"
       ],
       "execution_count": 0,
       "outputs": [
@@ -608,7 +608,7 @@
         "# There are two hyperparamenters for sampling (nbest_size and inverse temperature). see the paper [kudo18] for detail.\n",
         "for n in range(10):\n",
         "  print(sp.sample_encode_as_pieces('hello world', -1, 0.1))\n",
-        "  \n",
+        "\n",
         "for n in range(10):\n",
         "  print(sp.sample_encode_as_ids('hello world', -1, 0.1))"
       ],
@@ -858,8 +858,6 @@
       },
       "cell_type": "code",
       "source": [
-        "import sentencepiece as spm\n",
-        "\n",
         "# NFKC normalization and lower casing.\n",
         "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000 --normalization_rule_name=nfkc_cf')\n",
         "\n",
@@ -903,11 +901,12 @@
       },
       "cell_type": "code",
       "source": [
-        "def tocode(s):                                                                               \n",
-        "    out = []                                                                                 \n",
-        "    for c in s:                                                                              \n",
-        "        out.append(str(hex(ord(c))).replace('0x', 'U+'))                                     \n",
-        "    return ' '.join(out)          \n",
+        "def tocode(s):\n",
+        "    out = []\n",
+        "    for c in s:\n",
+        "        out.append(str(hex(ord(c))).replace('0x', 'U+'))\n",
+        "    return ' '.join(out)\n",
+        "\n",
         "\n",
         "# TSV format:  source Unicode code points <tab> target code points\n",
         "# normalize \"don't => do not,  I'm => I am\"\n",
@@ -923,7 +922,7 @@
         "# m.model embeds the normalization rule compiled into an FST.\n",
         "sp.load('m.model')\n",
         "print(sp.encode_as_pieces(\"I'm busy\"))  # normalzied to `I am busy'\n",
-        "print(sp.encode_as_pieces(\"I don't know it.\"))  # normalized to 'I do not know it.'\n"
+        "print(sp.encode_as_pieces(\"I don't know it.\"))  # normalized to 'I do not know it.'"
       ],
       "execution_count": 0,
       "outputs": [
@@ -1029,9 +1028,9 @@
         "        for piece in sp.encode_as_pieces(line):\n",
         "            freq.setdefault(piece, 0)\n",
         "            freq[piece] += 1\n",
-        "            \n",
+        "\n",
         "# only uses the token appearing more than 1000 times in the training data.\n",
-        "vocabs = list(filter(lambda x : x in freq and freq[x] > 1000, vocabs))\n",
+        "vocabs = list(filter(lambda x: x in freq and freq[x] > 1000, vocabs))\n",
         "sp.set_vocabulary(vocabs)\n",
         "print(sp.encode_as_pieces('this is a test.'))\n",
         "\n",
@@ -1133,20 +1132,17 @@
       },
       "cell_type": "code",
       "source": [
-        "freq={}\n",
+        "freq = {}\n",
         "with open('botchan.txt', 'r') as f:\n",
         "  for line in f:\n",
         "    line = line.rstrip()\n",
         "    for piece in line.split():\n",
         "      freq.setdefault(piece, 0)\n",
         "      freq[piece] += 1\n",
-        "            \n",
+        "\n",
         "with open('word_freq_list.tsv', 'w') as f:\n",
         "  for k, v in freq.items():\n",
         "    f.write('%s\\t%d\\n' % (k, v))\n",
-        "  \n",
-        "\n",
-        "import sentencepiece as spm\n",
         "\n",
         "spm.SentencePieceTrainer.train('--input=word_freq_list.tsv --input_format=tsv --model_prefix=m --vocab_size=2000')\n",
         "sp = spm.SentencePieceProcessor()\n",
@@ -1176,7 +1172,7 @@
         "\n",
         "Sentencepiece keeps track of byte offset (span) of each token, which is useful for highlighting the token on top of unnormalized text.\n",
         "\n",
-        "We first need to install protobuf module and sentencepiece_pb2.py as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
+        "We first need to install protobuf module as the byte offsets and all other meta data for segementation are encoded in protocol buffer.\n",
         "**encode_as_serialized_proto** method resturns serialized SentencePieceText proto. You can get the deserialized object by calling ParseFromString method.\n",
         "\n",
         "The definition of SentencePieceText proto is found [here](https://github.com/google/sentencepiece/blob/3be3f2e11e2bb923c579c6be5e7335809341587f/src/sentencepiece.proto#L23).\n"
@@ -1194,8 +1190,7 @@
       },
       "cell_type": "code",
       "source": [
-        "!pip install protobuf\n",
-        "!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py"
+        "!pip install protobuf"
       ],
       "execution_count": 0,
       "outputs": [
@@ -1233,8 +1228,7 @@
       },
       "cell_type": "code",
       "source": [
-        "import sentencepiece_pb2\n",
-        "import sentencepiece as spm\n",
+        "from sentencepiece import sentencepiece_pb2\n",
         "\n",
         "spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=m --vocab_size=2000')\n",
         "\n",