File: test-tokenizer-0.sh

package info (click to toggle)
llama.cpp 7593+dfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 71,012 kB
  • sloc: cpp: 329,391; ansic: 48,249; python: 32,103; lisp: 10,053; sh: 6,070; objc: 1,349; javascript: 924; xml: 384; makefile: 233
file content (41 lines) | stat: -rwxr-xr-x 929 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env bash
#
# Usage:
#
#   test-tokenizer-0.sh <name> <input>
#

# Require exactly two arguments: the tokenizer name and the input text file.
if [ $# -ne 2 ]; then
    # Pass $0 as an argument rather than embedding it in the format string,
    # so a '%' or '\' in the script path is printed literally.
    printf 'Usage: %s <name> <input>\n' "$0"
    exit 1
fi

name=$1
input=$2

# Build the C++ test binary. This runs before `set -e` is enabled, so a
# failed build does not stop the script.
make -j tests/test-tokenizer-0

printf "Testing %s on %s ...\n" "$name" "$input"

# From here on, abort as soon as a tokenizer run (or the timing grep) fails.
set -e

# Each tokenizer's combined stdout/stderr is captured in its own log file.
py_log="/tmp/test-tokenizer-0-$name-py.log"
cpp_log="/tmp/test-tokenizer-0-$name-cpp.log"

printf "Tokenizing using (py)  Python AutoTokenizer ...\n"
python3 ./tests/test-tokenizer-0.py "./models/tokenizers/$name" --fname-tok "$input" > "$py_log" 2>&1

printf "Tokenizing using (cpp) llama.cpp ...\n"
./tests/test-tokenizer-0 "./models/ggml-vocab-$name.gguf" "$input" > "$cpp_log" 2>&1

# Show the "tokenized in" line(s) from each log.
grep "tokenized in" "$py_log"
grep "tokenized in" "$cpp_log"

# The comparison below is expected to return non-zero on a mismatch, so stop
# aborting on errors.
set +e

# Compare <input>.tok against <input>.tokcpp (presumably written by the Python
# and C++ runs above, respectively -- confirm against those programs).
if diff "$input.tok" "$input.tokcpp" > /dev/null 2>&1; then
    printf "Tokenization is correct!\n"
else
    # Show only the first 32 lines of the diff to keep the output readable.
    diff "$input.tok" "$input.tokcpp" | head -n 32

    # NOTE(review): the script's exit status is that of this printf, so it
    # exits 0 even when the outputs differ -- confirm whether callers rely
    # on that before changing it.
    printf "Tokenization differs!\n"
fi