File: eval.py

package info (click to toggle)
whisper.cpp 1.8.3%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 32,228 kB
  • sloc: cpp: 188,765; ansic: 121,729; lisp: 10,221; sh: 4,272; objc: 2,159; ruby: 1,682; python: 1,177; javascript: 594; makefile: 144
file content (47 lines) | stat: -rw-r--r-- 1,205 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os
import glob
import jiwer
from normalizers import EnglishTextNormalizer

def get_reference():
    ref = {}
    for path in glob.glob('LibriSpeech/*/*/*/*.trans.txt'):
        with open(path) as fp:
            for line in fp:
                code, text = line.strip().split(" ", maxsplit=1)
                ref [code] = text
    return ref

def get_hypothesis():
    hyp = {}
    for path in glob.glob('LibriSpeech/*/*/*/*.flac.txt'):
        with open(path) as fp:
            text = fp.read().strip()
        code = os.path.basename(path).replace('.flac.txt', '')
        hyp[code] = text
    return hyp

def get_codes():
    codes = []
    for path in glob.glob('LibriSpeech/*/*/*/*.flac'):
        codes.append(os.path.basename(path).replace('.flac', ''))
    return sorted(codes)

def main():
    normalizer = EnglishTextNormalizer()

    ref_orig = get_reference()
    hyp_orig = get_hypothesis()

    ref_clean = []
    hyp_clean = []

    for code in get_codes():
        ref_clean.append(normalizer(ref_orig[code]))
        hyp_clean.append(normalizer(hyp_orig[code]))

    wer = jiwer.wer(ref_clean, hyp_clean)
    print(f"WER: {wer * 100:.2f}%")

if __name__ == '__main__':
    main()