File: benchmark_basic_english_normalize.py

package info (click to toggle)
pytorch-text 0.14.1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 11,560 kB
  • sloc: python: 14,197; cpp: 2,404; sh: 214; makefile: 20
file content (37 lines) | stat: -rw-r--r-- 1,310 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import time

import torch
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS
from torchtext.prototype.transforms import basic_english_normalize


def benchmark_basic_english_normalize():
    """Time three basic-English tokenizers over the AG_NEWS training split.

    The tokenizers compared are:

    * the one returned by ``get_tokenizer("basic_english")``,
    * the prototype ``basic_english_normalize()`` transform called directly,
    * that same prototype transform after ``torch.jit.script``.

    For each tokenizer the dataset is loaded again, a heading is printed,
    and the wall-clock time needed to tokenize every sample is printed.
    Nothing is returned.
    """

    def _run_benchmark_lookup(train, tokenizer):
        """Run ``tokenizer`` on the second element of every item in ``train``.

        Prints the elapsed ``time.monotonic()`` interval; the tokenizer
        output itself is discarded.
        """
        started = time.monotonic()
        for _unused, sample_text in train:
            tokenizer(sample_text)
        elapsed = time.monotonic() - started
        print("Tokenization time:", elapsed)

    # Build all tokenizers up front, before any dataset is loaded or timed.
    eager_tokenizer = get_tokenizer("basic_english")
    prototype_tokenizer = basic_english_normalize()
    scripted_tokenizer = torch.jit.script(prototype_tokenizer)

    # (heading, tokenizer) pairs, executed in this order.
    runs = (
        ("BasicEnglishNormalize - Eager Mode", eager_tokenizer),
        ("BasicEnglishNormalize Experimental - Eager Mode", prototype_tokenizer),
        ("BasicEnglishNormalize Experimental - Jit Mode", scripted_tokenizer),
    )

    for heading, tokenizer in runs:
        # The split is re-created for every run rather than reused
        # (presumably so each run starts from a fresh iterator -- confirm).
        train = AG_NEWS(split="train")
        print(heading)
        _run_benchmark_lookup(train, tokenizer)


# Run the benchmark only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    benchmark_basic_english_normalize()