1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
|
#!/usr/bin/env python
"""Parse a directory that contains a VoxForge dataset.
Recursively search for "PROMPTS" files in the given directory and print out
`<ID>\\t<AUDIO_PATH>\\t<TRANSCRIPTION>`
example: python parse_voxforge.py voxforge/de/Helge-20150608-aku
de5-001\t/datasets/voxforge/de/guenter-20140214-afn/wav/de5-001.wav\tES SOLL ETWA FÜNFZIGTAUSEND VERSCHIEDENE SORTEN GEBEN
...
Dataset can be obtained from http://www.repository.voxforge1.org/downloads/de/Trunk/Audio/Main/16kHz_16bit/
""" # noqa: E501
import argparse
import os
import sys
from pathlib import Path
def _parse_args():
    """Parse the command-line arguments of the script.

    The module docstring is reused verbatim as the ``--help`` description;
    ``RawTextHelpFormatter`` keeps its line breaks intact.

    Returns:
        argparse.Namespace: namespace with a single attribute ``input_dir``
        (a ``Path``), the directory given on the command line.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    # The script globs for files named "PROMPTS" (not `*.trans.txt`), so the
    # help text has to say so.
    parser.add_argument(
        "input_dir",
        type=Path,
        help="Directory that is searched recursively for `PROMPTS` files.",
    )
    return parser.parse_args()
def _parse_prompts(path: Path):
    """Yield the utterances listed in one ``PROMPTS`` file.

    Every non-empty line is split at its first space into an utterance ID and
    a transcript. The audio file is looked up at
    ``<path.parent.parent>/wav/<last "/"-component of the ID>.wav``.

    Args:
        path: Location of the ``PROMPTS`` file.

    Yields:
        Tuples ``(id_, audio_path, transcript)`` where ``transcript`` is
        upper-cased. Lines without a transcript, and lines whose audio file
        does not exist, are skipped.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8. Because this is a
            generator, the error may surface after some entries were yielded.
    """
    base_dir = path.parent.parent
    # Decode explicitly as UTF-8 so the result does not depend on the locale
    # of the machine running the script.
    with open(path, encoding="utf-8") as trans_fileobj:
        for line in trans_fileobj:
            line = line.strip()
            if not line:
                continue
            # partition() never raises: a line that holds only an ID yields an
            # empty transcript and is skipped instead of aborting the run.
            id_, _, transcript = line.partition(" ")
            if not transcript:
                continue
            transcript = transcript.upper()
            filename = id_.split("/")[-1]
            audio_path = base_dir / "wav" / f"{filename}.wav"
            if audio_path.exists():
                yield id_, audio_path, transcript
def _parse_directory(root_dir: Path):
    """Yield the utterances of every ``PROMPTS`` file found below ``root_dir``.

    Args:
        root_dir: Directory that is searched recursively for files named
            ``PROMPTS``.

    Yields:
        Whatever ``_parse_prompts`` yields for each file found, in sorted
        file-path order.
    """
    # glob() returns paths in arbitrary order; sorting keeps the output
    # reproducible between runs and machines.
    for prompt_file in sorted(root_dir.glob("**/PROMPTS")):
        try:
            yield from _parse_prompts(prompt_file)
        except UnicodeDecodeError as err:
            # Best effort: an undecodable file must not abort the whole run,
            # but report it on stderr so the omission is visible. Entries
            # yielded before the bad byte was reached have already been emitted.
            print(f"Skipping {prompt_file}: {err}", file=sys.stderr)
def _main():
    """Command-line entry point.

    Reads the input directory from the command line and writes one
    tab-separated line (ID, audio path, transcription) to stdout for every
    utterance found.
    """
    options = _parse_args()
    for utterance_id, audio_path, text in _parse_directory(options.input_dir):
        print(utterance_id, audio_path, text, sep="\t")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    _main()
|