File: eval.py

package info (click to toggle)

whisper.cpp 1.8.3+dfsg-2

links: PTS, VCS
area: main
in suites: sid
size: 32,228 kB
sloc: cpp: 188,765; ansic: 121,729; lisp: 10,221; sh: 4,272; objc: 2,159; ruby: 1,682; python: 1,177; javascript: 594; makefile: 144

file content (47 lines) | stat: -rw-r--r-- 1,205 bytes

import os
import glob
import jiwer
from normalizers import EnglishTextNormalizer

def get_reference():
    """Load the reference transcripts from the LibriSpeech directory tree.

    Every file matching ``LibriSpeech/*/*/*/*.trans.txt`` (relative to the
    current working directory) is read line by line. Each non-blank line is
    split at its first space into an utterance code and the transcript text.

    Returns:
        dict: utterance code -> transcript text. If a code appears more than
        once, the last occurrence wins.

    Raises:
        ValueError: if a non-blank line contains no space separating the code
            from the text.
    """
    ref = {}
    for path in glob.glob('LibriSpeech/*/*/*/*.trans.txt'):
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(path, encoding='utf-8') as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    # A blank (or trailing empty) line carries no entry;
                    # unpacking it below would raise ValueError.
                    continue
                code, text = line.split(" ", maxsplit=1)
                ref[code] = text
    return ref

def get_hypothesis():
    """Load the hypothesis transcripts stored next to the audio files.

    Every file matching ``LibriSpeech/*/*/*/*.flac.txt`` (relative to the
    current working directory) is read in full; its content, stripped of
    leading and trailing whitespace, becomes the hypothesis text. The
    utterance code is the file's base name without the trailing
    ``.flac.txt`` suffix.

    Returns:
        dict: utterance code -> hypothesis text.
    """
    suffix = '.flac.txt'
    hyp = {}
    for path in glob.glob('LibriSpeech/*/*/*/*.flac.txt'):
        # Explicit encoding so non-ASCII text is decoded the same way
        # regardless of the platform locale.
        with open(path, encoding='utf-8') as fp:
            text = fp.read().strip()
        # The glob guarantees the name ends with the suffix; slice it off the
        # end only, so an occurrence elsewhere in the name is left intact.
        code = os.path.basename(path)[:-len(suffix)]
        hyp[code] = text
    return hyp

def get_codes():
    """List the utterance codes of all audio files, in sorted order.

    Every file matching ``LibriSpeech/*/*/*/*.flac`` (relative to the current
    working directory) contributes its base name without the trailing
    ``.flac`` extension.

    Returns:
        list[str]: the codes, sorted ascending.
    """
    suffix = '.flac'
    # The glob guarantees each name ends with the suffix; slice it off the end
    # only, so a ".flac" occurring elsewhere in the name is left intact.
    return sorted(
        os.path.basename(path)[:-len(suffix)]
        for path in glob.glob('LibriSpeech/*/*/*/*.flac')
    )

def main():
    """Compute and print the word error rate over all audio files.

    For each code returned by ``get_codes()``, the reference and hypothesis
    texts are looked up, passed through an ``EnglishTextNormalizer`` and
    collected; the two resulting lists are then scored with ``jiwer.wer`` and
    the result is printed as a percentage with two decimals.

    Raises:
        KeyError: if a code has no entry in the reference or hypothesis dict.
    """
    normalize = EnglishTextNormalizer()

    references = get_reference()
    hypotheses = get_hypothesis()

    # Normalize reference first, then hypothesis, one utterance at a time.
    pairs = [
        (normalize(references[code]), normalize(hypotheses[code]))
        for code in get_codes()
    ]
    ref_clean = [ref_text for ref_text, _ in pairs]
    hyp_clean = [hyp_text for _, hyp_text in pairs]

    wer = jiwer.wer(ref_clean, hyp_clean)
    print(f"WER: {wer * 100:.2f}%")

# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()