File: quicktour.test.ts

Package: tokenizers 0.20.3+dfsg-1
/* eslint-disable */
// Keep a reference to Node's real require before it is shadowed by the shim
// inside the describe block below.
var globRequire = require

// Silence console.log so the documentation snippets below stay quiet when the
// tests run.
console.log = (..._args: any[]) => {}

describe('quicktourExample', () => {
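  // Redirect require('tokenizers') to the package root, so the snippets
  // between the START/END markers can be copied verbatim into the docs while
  // still exercising the local build.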
  function require(mod: string) {
    if (mod.startsWith('tokenizers')) {
      return globRequire('../../')
    } else {
      return globRequire(mod)
    }
  }

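  // Skipped by default: training requires the wikitext-103-raw dataset to be
  // present under data/.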
  it.skip('trains the tokenizer', async () => {
    // START init_tokenizer
    let { Tokenizer } = require('tokenizers')
    let { BPE } = require('tokenizers')

    let tokenizer = new Tokenizer(BPE.init({}, [], { unkToken: '[UNK]' }))
    // END init_tokenizer
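    // The trainer holds the training settings; the special tokens listed here
    // are added to the vocabulary first, in order ([UNK] = 0, [CLS] = 1, ...).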
    // START init_trainer
    let { bpeTrainer } = require('tokenizers')

    let trainer = bpeTrainer({
      specialTokens: ['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'],
    })
    // END init_trainer
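    // Pre-tokenize on whitespace so the trainer sees words rather than the raw sentence.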
    // START init_pretok
    let { whitespacePreTokenizer } = require('tokenizers')

    tokenizer.setPreTokenizer(whitespacePreTokenizer())
    // END init_pretok
    // START train
    let files = ['test', 'train', 'valid'].map((split) => `data/wikitext-103-raw/wiki.${split}.raw`)
    tokenizer.train(files, trainer)
    // END train
    // START save
    tokenizer.save('data/tokenizer-wiki.json')
    // END save
  })

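  // The quicktour below assumes the tokenizer trained above has already been
  // saved to data/tokenizer-wiki.json.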
  it('shows a quicktour example', async () => {
    let { Tokenizer } = require('tokenizers')

    // START reload_tokenizer
    let tokenizer = Tokenizer.fromFile('data/tokenizer-wiki.json')
    // END reload_tokenizer
    // START encode

    var output = await tokenizer.encode("Hello, y'all! How are you 😁 ?")
    // END encode
    // START print_tokens
    console.log(output.getTokens())
    // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
    // END print_tokens
    expect(output.getTokens()).toEqual(['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?'])
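    // The emoji is not in the vocabulary, so it is mapped to the [UNK] token.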
    // START print_ids
    console.log(output.getIds())
    // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
    // END print_ids
    expect(output.getIds()).toEqual([27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35])
    // START print_offsets
    let offsets = output.getOffsets()
    console.log(offsets[9])
    // (26, 27)
    // END print_offsets
    expect(offsets[9]).toEqual([26, 27])
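    // Offsets are character positions in the original string; the slice helper
    // extracts that span without splitting the multi-byte emoji the way a plain
    // String.prototype.slice would.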
    // START use_offsets
    let { slice } = require('tokenizers')

    let sentence = "Hello, y'all! How are you 😁 ?"
    let [start, end] = offsets[9]
    console.log(slice(sentence, start, end))
    // "😁"
    // END use_offsets
    expect(slice(sentence, start, end)).toEqual('😁')
    // START check_sep
    console.log(tokenizer.tokenToId('[SEP]'))
    // 2
    // END check_sep
    expect(tokenizer.tokenToId('[SEP]')).toEqual(2)
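    // Post-process with a template: single sentences become [CLS] $A [SEP],
    // pairs become [CLS] $A [SEP] $B [SEP] with the second segment typed as 1.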
    // START init_template_processing
    let { templateProcessing } = require('tokenizers')

    tokenizer.setPostProcessor(
      templateProcessing('[CLS] $A [SEP]', '[CLS] $A [SEP] $B:1 [SEP]:1', [
        ['[CLS]', tokenizer.tokenToId('[CLS]')],
        ['[SEP]', tokenizer.tokenToId('[SEP]')],
      ]),
    )
    // END init_template_processing
    // START print_special_tokens
    var output = await tokenizer.encode("Hello, y'all! How are you 😁 ?")
    console.log(output.getTokens())
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens
    expect(output.getTokens()).toEqual([
      '[CLS]',
      'Hello',
      ',',
      'y',
      "'",
      'all',
      '!',
      'How',
      'are',
      'you',
      '[UNK]',
      '?',
      '[SEP]',
    ])
    // START print_special_tokens_pair
    var output = await tokenizer.encode("Hello, y'all!", 'How are you 😁 ?')
    console.log(output.getTokens())
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens_pair
    expect(output.getTokens()).toEqual([
      '[CLS]',
      'Hello',
      ',',
      'y',
      "'",
      'all',
      '!',
      '[SEP]',
      'How',
      'are',
      'you',
      '[UNK]',
      '?',
      '[SEP]',
    ])
    // START print_type_ids
    console.log(output.getTypeIds())
    // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    // END print_type_ids
    expect(output.getTypeIds()).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
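    // encodeBatch tokenizes several inputs at once and returns one Encoding per input.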
    // START encode_batch

    var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you 😁 ?'])
    // END encode_batch
    // START encode_batch_pair
    // var output = await tokenizer.encodeBatch(
    //     [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
    // );
    // END encode_batch_pair
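    // Enable padding so every encoding in a batch has the same length; id 3 is
    // [PAD] in the vocabulary trained above.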
    // START enable_padding
    tokenizer.setPadding({ padId: 3, padToken: '[PAD]' })
    // END enable_padding
    // START print_batch_tokens
    var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you 😁 ?'])
    console.log(output[1].getTokens())
    // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
    // END print_batch_tokens
    expect(output[1].getTokens()).toEqual(['[CLS]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]', '[PAD]'])
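    // The attention mask is 0 at the padded position and 1 everywhere else.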
    // START print_attention_mask
    console.log(output[1].getAttentionMask())
    // [1, 1, 1, 1, 1, 1, 1, 0]
    // END print_attention_mask
    expect(output[1].getAttentionMask()).toEqual([1, 1, 1, 1, 1, 1, 1, 0])
  })
})