/* eslint-disable */
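// The START/END markers below delimit the snippets that get pulled into the library documentation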
var globRequire = require
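// Silence console.log so the documentation snippets below don't spam the test output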
console.log = (..._args: any[]) => {}
describe('quicktourExample', () => {
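// Shadow the global require so that require('tokenizers') in the snippets resolves to the local build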
function require(mod: string) {
if (mod.startsWith('tokenizers')) {
return globRequire('../../')
} else {
return globRequire(mod)
}
}
it.skip('trains the tokenizer', async () => {
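// Skipped by default: training expects the wikitext-103-raw files under data/ and takes a while to run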
// START init_tokenizer
let { Tokenizer } = require('tokenizers')
let { BPE } = require('tokenizers')
let tokenizer = new Tokenizer(BPE.init({}, [], { unkToken: '[UNK]' }))
// END init_tokenizer
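// The order of specialTokens fixes their ids: [UNK]=0, [CLS]=1, [SEP]=2, [PAD]=3, [MASK]=4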
// START init_trainer
let { bpeTrainer } = require('tokenizers')
let trainer = bpeTrainer({
specialTokens: ['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'],
})
// END init_trainer
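// The Whitespace pre-tokenizer splits words from punctuation, which is why "y'all" becomes ["y", "'", "all"] below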
// START init_pretok
let { whitespacePreTokenizer } = require('tokenizers')
tokenizer.setPreTokenizer(whitespacePreTokenizer())
// END init_pretok
// START train
let files = ['test', 'train', 'valid'].map((split) => `data/wikitext-103-raw/wiki.${split}.raw`)
tokenizer.train(files, trainer)
// END train
// START save
tokenizer.save('data/tokenizer-wiki.json')
// END save
})
it('shows a quicktour example', async () => {
let { Tokenizer } = require('tokenizers')
// START reload_tokenizer
let tokenizer = Tokenizer.fromFile('data/tokenizer-wiki.json')
// END reload_tokenizer
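// The assertions below assume data/tokenizer-wiki.json was produced by the (skipped) training test above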
// START encode
var output = await tokenizer.encode("Hello, y'all! How are you 😁 ?")
// END encode
// START print_tokens
console.log(output.getTokens())
// ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
// END print_tokens
expect(output.getTokens()).toEqual(['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?'])
// START print_ids
console.log(output.getIds())
// [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
// END print_ids
expect(output.getIds()).toEqual([27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35])
// START print_offsets
let offsets = output.getOffsets()
console.log(offsets[9])
// [26, 27]
// END print_offsets
expect(offsets[9]).toEqual([26, 27])
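// Offsets are character-based, so use the library's unicode-aware slice helper rather than
// String.prototype.slice, which counts UTF-16 code units and would cut the emoji in half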
// START use_offsets
let { slice } = require('tokenizers')
let sentence = "Hello, y'all! How are you 😁 ?"
let [start, end] = offsets[9]
console.log(slice(sentence, start, end))
// "😁"
// END use_offsets
expect(slice(sentence, start, end)).toEqual('😁')
// START check_sep
console.log(tokenizer.tokenToId('[SEP]'))
// 2
// END check_sep
expect(tokenizer.tokenToId('[SEP]')).toEqual(2)
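// templateProcessing takes a single-sequence template, a pair template, and the special tokens with
// their ids; $A/$B stand for the input sequences and ':1' assigns type id 1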
// START init_template_processing
let { templateProcessing } = require('tokenizers')
tokenizer.setPostProcessor(
templateProcessing('[CLS] $A [SEP]', '[CLS] $A [SEP] $B:1 [SEP]:1', [
['[CLS]', tokenizer.tokenToId('[CLS]')],
['[SEP]', tokenizer.tokenToId('[SEP]')],
]),
)
// END init_template_processing
// START print_special_tokens
var output = await tokenizer.encode("Hello, y'all! How are you 😁 ?")
console.log(output.getTokens())
// ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
// END print_special_tokens
expect(output.getTokens()).toEqual([
'[CLS]',
'Hello',
',',
'y',
"'",
'all',
'!',
'How',
'are',
'you',
'[UNK]',
'?',
'[SEP]',
])
// START print_special_tokens_pair
var output = await tokenizer.encode("Hello, y'all!", 'How are you 😁 ?')
console.log(output.getTokens())
// ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
// END print_special_tokens_pair
expect(output.getTokens()).toEqual([
'[CLS]',
'Hello',
',',
'y',
"'",
'all',
'!',
'[SEP]',
'How',
'are',
'you',
'[UNK]',
'?',
'[SEP]',
])
// START print_type_ids
console.log(output.getTypeIds())
// [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
// END print_type_ids
expect(output.getTypeIds()).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
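// encodeBatch returns an array with one Encoding per input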
// START encode_batch
var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you 😁 ?'])
// END encode_batch
// START encode_batch_pair
// var output = await tokenizer.encodeBatch(
// [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
// );
// END encode_batch_pair
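// padId 3 matches the id [PAD] received from its position in the trainer's specialTokens list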
// START enable_padding
tokenizer.setPadding({ padId: 3, padToken: '[PAD]' })
// END enable_padding
// START print_batch_tokens
var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you 😁 ?'])
console.log(output[1].getTokens())
// ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
// END print_batch_tokens
expect(output[1].getTokens()).toEqual(['[CLS]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]', '[PAD]'])
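// The attention mask marks real tokens with 1 and padding with 0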
// START print_attention_mask
console.log(output[1].getAttentionMask())
// [1, 1, 1, 1, 1, 1, 1, 0]
// END print_attention_mask
expect(output[1].getAttentionMask()).toEqual([1, 1, 1, 1, 1, 1, 1, 0])
})
})