1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
|
import csv
import os
from pathlib import Path
from typing import Dict, Tuple
from torch import Tensor
from torchaudio.datasets import COMMONVOICE
from torchaudio_unittest.common_utils import get_whitenoise, normalize_wav, save_wav, TempDirMixin, TorchaudioTestCase
_ORIGINAL_EXT_AUDIO = COMMONVOICE._ext_audio
_SAMPLE_RATE = 48000
_HEADERS = ["client_ids", "path", "sentence", "up_votes", "down_votes", "age", "gender", "accent"]
_EN_TRAIN_CSV_CONTENTS = [
[
"9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
"common_voice_en_18885784.wav",
"He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.",
"2",
"0",
"",
"",
"",
],
[
"c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
"common_voice_en_556542.wav",
"Once more into the breach",
"2",
"0",
"thirties",
"male",
"us",
],
[
"f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
"common_voice_en_18607573.wav",
"Caddy, show Miss Clare and Miss Summerson their rooms.",
"2",
"0",
"twenties",
"male",
"canada",
],
]
_FR_TRAIN_CSV_CONTENTS = [
[
"a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef"
"18343441c601cae0597a4b0d3144",
"89e67e7682b36786a0b4b4022c4d42090c86edd96c78c12d30088e62522b8fe466ea4912e6a1055dfb91b296a0743e0a2bbe"
"16cebac98ee5349e3e8262cb9329",
"Or sur ce point nous n’avons aucune réponse de votre part.",
"2",
"0",
"twenties",
"male",
"france",
],
[
"a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef18"
"343441c601cae0597a4b0d3144",
"87d71819a26179e93acfee149d0b21b7bf5e926e367d80b2b3792d45f46e04853a514945783ff764c1fc237b4eb0ee2b0a7a7"
"cbd395acbdfcfa9d76a6e199bbd",
"Monsieur de La Verpillière, laissez parler le ministre",
"2",
"0",
"twenties",
"male",
"france",
],
]
def get_mock_dataset(root_dir, train_csv_contents, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
"""
prepares mocked dataset
"""
mocked_data = []
# Note: extension is changed to wav for the sake of test
# Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
# Tsv file name difference does not mean different subset, testing as a whole dataset here
tsv_filename = os.path.join(root_dir, "train.tsv")
audio_base_path = os.path.join(root_dir, "clips")
os.makedirs(audio_base_path, exist_ok=True)
with open(tsv_filename, "w", newline="") as tsv:
writer = csv.writer(tsv, delimiter="\t")
writer.writerow(_HEADERS)
for i, content in enumerate(train_csv_contents):
content[2] = str(content[2].encode("utf-8"))
writer.writerow(content)
if not content[1].endswith(ext_audio):
audio_path = os.path.join(audio_base_path, content[1] + ext_audio)
else:
audio_path = os.path.join(audio_base_path, content[1])
data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype="float32")
save_wav(audio_path, data, _SAMPLE_RATE)
# Append data entry
mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, content))))
return mocked_data
def get_mock_dataset_en(root_dir, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
"""
prepares english mocked dataset
"""
return get_mock_dataset(root_dir, _EN_TRAIN_CSV_CONTENTS, ext_audio)
def get_mock_dataset_fr(root_dir, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
"""
prepares french mocked dataset
"""
return get_mock_dataset(root_dir, _FR_TRAIN_CSV_CONTENTS, ext_audio)
class BaseTestCommonVoice(TempDirMixin):
root_dir = None
data = None
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.root_dir = cls.get_base_temp_dir()
COMMONVOICE._ext_audio = ".wav"
@classmethod
def tearDownClass(cls):
super().tearDownClass()
COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO
def _test_commonvoice(self, dataset):
n_ite = 0
for i, (waveform, sample_rate, dictionary) in enumerate(dataset):
expected_dictionary = self.data[i][2]
expected_data = self.data[i][0]
self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
assert sample_rate == _SAMPLE_RATE
assert dictionary == expected_dictionary
n_ite += 1
assert n_ite == len(self.data)
class TestCommonVoiceEN(BaseTestCommonVoice, TorchaudioTestCase):
root_dir = None
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.data = get_mock_dataset_en(cls.root_dir, COMMONVOICE._ext_audio)
def test_commonvoice_str(self):
dataset = COMMONVOICE(self.root_dir)
self._test_commonvoice(dataset)
def test_commonvoice_path(self):
dataset = COMMONVOICE(Path(self.root_dir))
self._test_commonvoice(dataset)
class TestCommonVoiceFR(BaseTestCommonVoice, TorchaudioTestCase):
root_dir = None
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.data = get_mock_dataset_fr(cls.root_dir, COMMONVOICE._ext_audio)
def test_commonvoice_str(self):
dataset = COMMONVOICE(self.root_dir)
self._test_commonvoice(dataset)
|