1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
|
import torch
from dataset.hubert_dataset import _crop_audio_label
from parameterized import parameterized
from torchaudio.models import hubert_base
from torchaudio_unittest.common_utils import get_whitenoise, TorchaudioTestCase
class TestCropAudioLabel(TorchaudioTestCase):
@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
@parameterized.expand(
[
(400,),
(800,),
]
)
def test_zero_offset(self, num_frames):
"""Test _crop_audio_label method with zero frame offset.
Given the ``num_frames`` argument, the method returns the first ``num_frames`` samples in the waveform,
the corresponding labels, and the length of the cropped waveform.
The cropped waveform should be identical to the first ``num_frames`` samples of original waveform.
The length of the cropped waveform should be identical to ``num_frames``.
The dimension of the labels should be identical to HuBERT transformer layer output frame dimension.
"""
sample_rate = 16000
waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05)
length = waveform.shape[1]
label = torch.rand(50)
model = hubert_base()
waveform_out, label_out, length = _crop_audio_label(waveform, label, length, num_frames, rand_crop=False)
hubert_feat = model.extract_features(waveform_out.unsqueeze(0), num_layers=1)[0][0]
self.assertEqual(waveform_out.shape[0], num_frames, length)
self.assertEqual(waveform_out, waveform[0, :num_frames])
self.assertEqual(label_out.shape[0], hubert_feat.shape[1])
@parameterized.expand(
[
(400,),
(800,),
]
)
def test_rand_crop(self, num_frames):
"""Test _crop_audio_label method with random frame offset.
Given the ``num_frames`` argument, the method returns ``num_frames`` samples in the waveform
starting with random offset, the corresponding labels, and the length of the cropped waveform.
The length of the cropped waveform should be identical to ``num_frames``.
The dimension of the labels should be identical to HuBERT transformer layer output frame dimension.
"""
sample_rate = 16000
waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05)
length = waveform.shape[1]
label = torch.rand(50)
model = hubert_base()
waveform_out, label_out, length = _crop_audio_label(waveform, label, length, num_frames, rand_crop=False)
hubert_feat = model.extract_features(waveform_out.unsqueeze(0), num_layers=1)[0][0]
self.assertEqual(waveform_out.shape[0], num_frames, length)
self.assertEqual(label_out.shape[0], hubert_feat.shape[1])
|