1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139
|
#!/usr/bin/env python3
"""This is the preprocessing script for HuBERT model training.
The script includes:
- File list creation
- MFCC/HuBERT feature extraction
- KMeans clustering model training
- Pseudo-label generation
"""
import logging
from argparse import ArgumentParser, RawTextHelpFormatter
from pathlib import Path
import torch
from utils import create_tsv, dump_features, get_km_label, learn_kmeans
def _init_logger(debug=False):
    """Configure the root logger for the preprocessing script.

    Args:
        debug (bool, optional): If ``True``, log at DEBUG level with a
            verbose format including the level and function name;
            otherwise log at INFO level with the bare message.
            (Default: ``False``)
    """
    if debug:
        level = logging.DEBUG
        body_fmt = "%(levelname)5s: %(funcName)10s: %(message)s"
    else:
        level = logging.INFO
        body_fmt = "%(message)s"
    # Every record is prefixed with a timestamp regardless of debug mode.
    logging.basicConfig(level=level, format="%(asctime)s: " + body_fmt)
def _str2bool(value):
    """Convert a command-line string such as ``"true"``/``"false"`` to ``bool``.

    ``argparse`` treats a ``ValueError`` raised by a ``type=`` callable as an
    invalid argument, so unrecognized spellings yield a proper usage error
    instead of silently evaluating truthy (the pitfall of ``type=bool``,
    where ``bool("False")`` is ``True``).
    """
    if isinstance(value, bool):
        return value
    lowered = value.lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise ValueError(f"expected a boolean value, got {value!r}")


def _parse_args():
    """Parse and return the command-line arguments of the preprocessing script."""
    parser = ArgumentParser(
        description=__doc__,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--debug", action="store_true", help="Enable debug log")
    parser.add_argument("--dataset", default="librispeech", type=str, choices=["librispeech", "librilight"])
    parser.add_argument(
        "--root-dir",
        type=Path,
        help="The path to the directory where the directory ``LibriSpeech`` or ``LibriLight`` is stored.",
    )
    parser.add_argument("--num-rank", default=5, type=int)
    parser.add_argument("--feat-type", default="mfcc", choices=["mfcc", "hubert"], type=str)
    parser.add_argument(
        "--layer-index",
        default=6,
        type=int,
        help="The layer index in HuBERT model for feature extraction. (``1`` means the first layer output)",
    )
    parser.add_argument(
        "--checkpoint-path",
        default=None,
        type=Path,
        help="The model checkpoint of hubert_pretrain_base model.",
    )
    # BUG FIX: ``type=bool`` made ``--use-gpu False`` (and any non-empty
    # string) parse as True. ``_str2bool`` keeps the ``--use-gpu <value>``
    # CLI shape while interpreting the value correctly.
    parser.add_argument("--use-gpu", default=False, type=_str2bool)
    parser.add_argument(
        "--exp-dir",
        type=Path,
        help="The directory to store the experiment outputs.",
    )
    parser.add_argument(
        "--num-cluster",
        default=100,
        type=int,
        help="The number of clusters for KMeans clustering.",
    )
    parser.add_argument(
        "--percent",
        default=-1,
        type=float,
        help="The percent of data for KMeans clustering. If negative, use all data. (Default: -1)",
    )
    args = parser.parse_args()
    return args
def main(args):
    """Run the full preprocessing pipeline.

    Steps:
        1. Create TSV file lists for the training/validation splits.
        2. Extract MFCC or HuBERT features (skipped if the feature
           directory already exists, so completed runs are not redone).
        3. Fit a KMeans clustering model on the training features.
        4. Predict pseudo-labels for both splits.

    Args:
        args: Parsed command-line arguments (see ``_parse_args``).
    """
    _init_logger(args.debug)
    # ``parents=True`` creates missing ancestor directories and
    # ``exist_ok=True`` removes the race between an exists() check and
    # the creation (the original guarded ``mkdir()`` failed on both).
    args.exp_dir.mkdir(parents=True, exist_ok=True)
    if args.feat_type == "mfcc":
        data_dir = args.exp_dir / "data" / "mfcc"
    else:
        # Encode the layer index in the directory name so features from
        # different HuBERT layers do not overwrite each other.
        data_dir = args.exp_dir / "data" / f"{args.feat_type}_{args.layer_index}"
    data_dir.mkdir(parents=True, exist_ok=True)

    tsv_dir = data_dir / "tsv"
    feat_dir = data_dir / "feat"
    km_dir = data_dir / "km_model"
    label_dir = data_dir / "label"

    device = torch.device("cuda" if args.use_gpu else "cpu")

    # Create file lists for training and validation (optional)
    create_tsv(args.root_dir, tsv_dir)

    # Extract features for KMeans clustering. An existing ``feat_dir``
    # is treated as a completed run and extraction is skipped entirely.
    if not feat_dir.exists():
        feat_dir.mkdir(parents=True)
        for split in ["train", "valid"]:
            # Ranks are 1-based shards; each call processes one shard.
            for rank in range(1, args.num_rank + 1):
                dump_features(
                    tsv_dir / f"{args.dataset}_{split}.tsv",
                    feat_dir,
                    split,
                    rank,
                    args.num_rank,
                    device,
                    args.feat_type,
                    args.layer_index,
                    args.checkpoint_path,
                    16_000,  # sample rate — presumably matches the dataset audio; TODO confirm
                )

    # Fit KMeans clustering model on the training split only
    learn_kmeans(
        feat_dir,
        "train",
        args.num_rank,
        km_dir,
        args.num_cluster,
        args.percent,
    )

    # Predict pseudo-labels for MFCC or HuBERT features on both splits
    for split in ["train", "valid"]:
        get_km_label(
            feat_dir,
            km_dir,
            label_dir,
            split,
            args.num_rank,
            device,
        )
# Script entry point: parse CLI arguments and run the preprocessing pipeline.
if __name__ == "__main__":
    main(_parse_args())
|